Files
xc-llm-ascend/tests/ut/quantization/test_modelslim_config.py

255 lines
11 KiB
Python
Raw Normal View History

import json
import os
import tempfile
from unittest.mock import MagicMock, patch
from vllm.model_executor.layers.fused_moe import FusedMoE
[V1] MTP supports torchair (#2145) ### What this PR does / why we need it? Support MTP with: - [x] V0 Scheduler - [x] TorchAir - [x] Single DP - [x] Multi DP - [x] Disaggregate PD Known issues: - [ ] Not support V1 Scheduler (chunked prefill), will be supported in a few weeks - [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now, need to comment out the line 171-175 in file `vllm/vllm/v1/metrics/loggers.py` ``` if (len(self.engine_indexes) > 1 and vllm_config.speculative_config is not None): raise NotImplementedError("Prometheus metrics with Spec Decoding " "with >1 EngineCore per AsyncLLM is not " "supported yet.") ``` To start an online server with torchair enabled, here is an example: ``` python -m vllm.entrypoints.openai.api_server \ --model="/weights/DeepSeek-R1_w8a8/" \ --trust-remote-code \ --max-model-len 40000 \ --tensor-parallel-size 4 \ --data_parallel_size 4 \ --max-num-seqs 16 \ --no-enable-prefix-caching \ --enable_expert_parallel \ --served-model-name deepseekr1 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --quantization ascend \ --host 0.0.0.0 \ --port 1234 \ --additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \ --gpu_memory_utilization 0.9 ``` offline example with torchair enabled ``` from vllm import LLM, SamplingParams prompts = [ "Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=16, temperature=0) # Create an LLM. llm = LLM( model="/home/data/DeepSeek-R1_w8a8/", tensor_parallel_size=16, max_num_seqs=16, gpu_memory_utilization=0.9, distributed_executor_backend="mp", enable_expert_parallel=True, speculative_config={ "method": "deepseek_mtp", "num_speculative_tokens": 1, }, trust_remote_code=True, enforce_eager=False, max_model_len=2000, additional_config = { 'torchair_graph_config': { 'enabled': True, "graph_batch_sizes": [16], 'enable_multistream_shared_expert': False, }, "ascend_scheduler_config": { "enabled": True }, # 'expert_tensor_parallel_size': 16, } ) # Generate texts from the prompts. # llm.start_profile() outputs = llm.generate(prompts, sampling_params) # llm.stop_profile() for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3 --------- Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.linear import LinearBase
from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
[Feature][Quant] Auto-detect quantization format from model files (#6645) ## Summary - Add automatic quantization format detection, eliminating the need to manually specify `--quantization` when serving quantized models. - The detection inspects only lightweight JSON files (`quant_model_description.json` and `config.json`) at engine initialization time, with no `.safetensors` reads. - User-explicit `--quantization` flags are always respected; auto-detection only applies when the flag is omitted. ## Details **Detection priority:** 1. `quant_model_description.json` exists → `quantization="ascend"` (ModelSlim) 2. `config.json` contains `"quant_method": "compressed-tensors"` → `quantization="compressed-tensors"` (LLM-Compressor) 3. Neither → default float behavior **Technical approach:** Hooked into `NPUPlatform.check_and_update_config()` to run detection after `VllmConfig.__post_init__`. Since `quant_config` is already `None` at that point, we explicitly recreate it via `VllmConfig._get_quantization_config()` to trigger the full quantization initialization pipeline. ## Files Changed | File | Description | |------|-------------| | `vllm_ascend/quantization/utils.py` | Added `detect_quantization_method()` and `maybe_auto_detect_quantization()` | | `vllm_ascend/platform.py` | Integrated auto-detection in `check_and_update_config()` | | `vllm_ascend/quantization/modelslim_config.py` | Improved error handling for weight loading | - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-02-26 10:59:25 +08:00
from vllm_ascend.quantization.modelslim_config import (
MODELSLIM_CONFIG_FILENAME,
[Feature][Quant] Auto-detect quantization format from model files (#6645) ## Summary - Add automatic quantization format detection, eliminating the need to manually specify `--quantization` when serving quantized models. - The detection inspects only lightweight JSON files (`quant_model_description.json` and `config.json`) at engine initialization time, with no `.safetensors` reads. - User-explicit `--quantization` flags are always respected; auto-detection only applies when the flag is omitted. ## Details **Detection priority:** 1. `quant_model_description.json` exists → `quantization="ascend"` (ModelSlim) 2. `config.json` contains `"quant_method": "compressed-tensors"` → `quantization="compressed-tensors"` (LLM-Compressor) 3. Neither → default float behavior **Technical approach:** Hooked into `NPUPlatform.check_and_update_config()` to run detection after `VllmConfig.__post_init__`. Since `quant_config` is already `None` at that point, we explicitly recreate it via `VllmConfig._get_quantization_config()` to trigger the full quantization initialization pipeline. ## Files Changed | File | Description | |------|-------------| | `vllm_ascend/quantization/utils.py` | Added `detect_quantization_method()` and `maybe_auto_detect_quantization()` | | `vllm_ascend/platform.py` | Integrated auto-detection in `check_and_update_config()` | | `vllm_ascend/quantization/modelslim_config.py` | Improved error handling for weight loading | - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-02-26 10:59:25 +08:00
AscendModelSlimConfig,
)
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
[main2main] upgrade vllm main 0202 (#6560) ### What this PR does / why we need it? 1. Fix `TypeError: FusedMoEParallelConfig.__init__() missing 1 required positional argument: 'is_sequence_parallel'` due to https://github.com/vllm-project/vllm/pull/32567 2. Fix ` TypeError: '>' not supported between instances of 'MagicMock' and 'int'` due to https://github.com/vllm-project/vllm/pull/33035 3. Fix `TypeError: Can't instantiate abstract class AscendMLAImpl with abstract methods forward_mha, forward_mqa` and AttributeError: 'bool' object has no attribute 'process_weights_after_loading' due to https://github.com/vllm-project/vllm/pull/33284 4. Fix `'AscendSharedFusedMoE' object has no attribute '_routed_input_transform'`due to https://github.com/vllm-project/vllm/pull/32790 5. Fix `NPUModelRunner._dummy_run() got an unexpected keyword argument 'num_active_loras'` due to https://github.com/vllm-project/vllm/pull/32005 6. Fix the problem caused by` 'tuple' object has no attribute 'job_id'` due to https://github.com/vllm-project/vllm/pull/27492 7. Fix the problem that all_moe_layers is not equal to vllm.moe_forward, vllm.moe_forward_shared due to https://github.com/vllm-project/vllm/pull/33184 8. Add patch to fix the problem "got multiple values for keyword argument 'add_special_tokens'" due to https://github.com/vllm-project/vllm/pull/32863 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
2026-02-05 19:31:17 +08:00
from vllm.model_executor.layers.attention import Attention
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
class TestAscendModelSlimConfig(TestBase):
def setUp(self):
self.sample_config = {
"weight": "INT8",
"fa_quant_type": "C8",
"layer1.weight": "INT8",
"layer2.weight": "FLOAT",
"fused_layer.weight": "FLOAT",
"fused_layer.shard1.weight": "FLOAT",
"fused_layer.shard2.weight": "FLOAT",
"shard1.weight": "FLOAT",
"shard2.weight": "FLOAT",
}
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
self.ascend_config = AscendModelSlimConfig(self.sample_config)
self.ascend_config.packed_modules_mapping = None
def test_init(self):
self.assertEqual(self.ascend_config.quant_description,
self.sample_config)
def test_repr(self):
repr_str = repr(self.ascend_config)
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
self.assertTrue(repr_str.startswith("AscendModelSlimConfig:\n"))
def test_get_name(self):
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
self.assertEqual(AscendModelSlimConfig.get_name(),
ASCEND_QUANTIZATION_METHOD)
def test_get_supported_act_dtypes(self):
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
supported_dtypes = AscendModelSlimConfig.get_supported_act_dtypes()
self.assertEqual(len(supported_dtypes), 3)
def test_get_min_capability(self):
with self.assertRaises(NotImplementedError):
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
AscendModelSlimConfig.get_min_capability()
def test_get_config_filenames(self):
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
filenames = AscendModelSlimConfig.get_config_filenames()
self.assertEqual(filenames, [])
def test_from_config(self):
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
config = AscendModelSlimConfig.from_config(self.sample_config)
self.assertIsInstance(config, AscendModelSlimConfig)
self.assertEqual(config.quant_description, self.sample_config)
@patch('torch.npu.is_available')
def test_override_quantization_method(self, mock_is_available):
# Test when NPU is available
mock_is_available.return_value = True
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
result = AscendModelSlimConfig.override_quantization_method(None, None)
self.assertIsNone(result)
hf_quant_cfg = {"quant_method": ""}
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
result = AscendModelSlimConfig.override_quantization_method(
hf_quant_cfg, None)
self.assertEqual(result, "ascend")
# Test when NPU is not available
mock_is_available.return_value = False
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
result = AscendModelSlimConfig.override_quantization_method(None, None)
self.assertIsNone(result)
hf_quant_cfg = {"quant_method": ""}
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
result = AscendModelSlimConfig.override_quantization_method(
hf_quant_cfg, None)
self.assertIsNone(result)
def test_get_quant_method_for_linear(self):
mock_config = MagicMock()
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
mock_config.model_config.hf_config.model_type = None
linear_layer = MagicMock(spec=LinearBase)
# Test skipped layer
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
with patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch.object(self.ascend_config, \
'is_layer_skipped_ascend',
return_value=True):
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
self.assertIsInstance(method, AscendUnquantizedLinearMethod)
# Test quantized layer
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
mock_scheme = MagicMock()
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch("vllm_ascend.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme), \
patch('vllm_ascend.quantization.method_adapters.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear:
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
self.assertIs(method, mock_ascend_linear.return_value)
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
mock_ascend_linear.assert_called_once_with(mock_scheme)
def test_get_quant_method_for_attention(self):
attention_layer = MagicMock(spec=Attention)
mock_config = MagicMock()
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
mock_config.model_config.hf_config.model_type = None
mock_scheme = MagicMock()
with patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch("vllm_ascend.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme), \
patch('vllm_ascend.quantization.method_adapters.AscendKVCacheMethod', \
return_value=MagicMock()) as mock_ascend_kvcache:
# Test with fa_quant_type
method = self.ascend_config.get_quant_method(
attention_layer, ".attn")
self.assertIs(method, mock_ascend_kvcache.return_value)
def test_get_quant_method_for_fused_moe(self):
fused_moe_layer = MagicMock(spec=FusedMoE)
[V1] MTP supports torchair (#2145) ### What this PR does / why we need it? Support MTP with: - [x] V0 Scheduler - [x] TorchAir - [x] Single DP - [x] Multi DP - [x] Disaggregate PD Known issues: - [ ] Not support V1 Scheduler (chunked prefill), will be supported in a few weeks - [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now, need to comment out the line 171-175 in file `vllm/vllm/v1/metrics/loggers.py` ``` if (len(self.engine_indexes) > 1 and vllm_config.speculative_config is not None): raise NotImplementedError("Prometheus metrics with Spec Decoding " "with >1 EngineCore per AsyncLLM is not " "supported yet.") ``` To start an online server with torchair enabled, here is an example: ``` python -m vllm.entrypoints.openai.api_server \ --model="/weights/DeepSeek-R1_w8a8/" \ --trust-remote-code \ --max-model-len 40000 \ --tensor-parallel-size 4 \ --data_parallel_size 4 \ --max-num-seqs 16 \ --no-enable-prefix-caching \ --enable_expert_parallel \ --served-model-name deepseekr1 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --quantization ascend \ --host 0.0.0.0 \ --port 1234 \ --additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \ --gpu_memory_utilization 0.9 ``` offline example with torchair enabled ``` from vllm import LLM, SamplingParams prompts = [ "Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=16, temperature=0) # Create an LLM. llm = LLM( model="/home/data/DeepSeek-R1_w8a8/", tensor_parallel_size=16, max_num_seqs=16, gpu_memory_utilization=0.9, distributed_executor_backend="mp", enable_expert_parallel=True, speculative_config={ "method": "deepseek_mtp", "num_speculative_tokens": 1, }, trust_remote_code=True, enforce_eager=False, max_model_len=2000, additional_config = { 'torchair_graph_config': { 'enabled': True, "graph_batch_sizes": [16], 'enable_multistream_shared_expert': False, }, "ascend_scheduler_config": { "enabled": True }, # 'expert_tensor_parallel_size': 16, } ) # Generate texts from the prompts. # llm.start_profile() outputs = llm.generate(prompts, sampling_params) # llm.stop_profile() for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3 --------- Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
mock_config = MagicMock()
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
mock_config.model_config.hf_config.model_type = None
# Test skipped layer
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.ops.fused_moe.fused_moe.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
method = self.ascend_config.get_quant_method(
fused_moe_layer, "moe_layer")
self.assertIs(method, mock_ascend_moe.return_value)
# Test quantized layer
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
mock_scheme = MagicMock()
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_config), \
patch("vllm_ascend.quantization.modelslim_config.create_scheme_for_layer", return_value=mock_scheme), \
patch('vllm_ascend.quantization.method_adapters.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
method = self.ascend_config.get_quant_method(
fused_moe_layer, "moe_layer")
self.assertIs(method, mock_ascend_moe.return_value)
def test_is_layer_skipped_ascend(self):
# Test non-fused layer that should be quantized
self.assertFalse(self.ascend_config.is_layer_skipped_ascend("layer1"))
# Test non-fused layer that should be skipped
self.assertTrue(self.ascend_config.is_layer_skipped_ascend("layer2"))
# Test fused layer
fused_mapping = {"fused_layer": ["shard1", "shard2"]}
self.assertTrue(
self.ascend_config.is_layer_skipped_ascend("fused_layer",
fused_mapping))
# Test inconsistent fused layer shards
bad_config = {"shard1.weight": "FLOAT", "shard2.weight": "INT8"}
[Refactor] Quantization Module Refactor (#5738) ### Summary This PR refactors the `vllm_ascend/quantization` module to improve code organization, maintainability, and extensibility. The refactoring introduces a clear separation of concerns with a registry-based scheme discovery pattern, abstract base classes for quantization schemes, and dedicated wrapper classes. ### Key Changes #### 1. **Modular Directory Structure** | Before | After | |--------|-------| | Flat file structure with mixed responsibilities | Organized into `methods/` subpackage for schemes | | Single `quant_config.py` (600+ lines) | Separate config files: `modelslim_config.py`, `compressed_tensors_config.py` | | `utils.py` with scheme lookup logic | `methods/registry.py` with decorator-based registration | #### 2. **Registry-Based Scheme Discovery** Replaced hardcoded `ASCEND_QUANTIZATION_METHOD_MAP` dictionary with a decorator-based registry pattern: ```python # Before: Manual dictionary mapping ASCEND_QUANTIZATION_METHOD_MAP = { "W8A8_DYNAMIC": {"linear": AscendW8A8DynamicLinearMethod, ...}, ... } # After: Decorator-based registration @register_scheme("W8A8_DYNAMIC", "linear") class AscendW8A8DynamicLinearMethod(AscendLinearScheme): ... ``` #### 3. **Abstract Base Classes** Introduced three abstract base classes in `methods/base.py`: - `AscendLinearScheme` - Base for linear layer quantization - `AscendMoEScheme` - Base for MoE layer quantization - `AscendAttentionScheme` - Base for attention layer quantization #### 4. **Separated Config and Wrapper Classes** - **Config classes** (`AscendModelSlimConfig`, `AscendCompressedTensorsConfig`): Handle config parsing and scheme selection - **Wrapper classes** (`AscendLinearMethod`, `AscendFusedMoEMethod`, etc.): Implement vLLM interfaces and delegate to schemes #### 5. **Cleaner Public API** ```python # New clean module interface from vllm_ascend.quantization import ( AscendModelSlimConfig, AscendCompressedTensorsConfig, ) from vllm_ascend.quantization.methods import get_scheme_class ``` ### Architecture Diagram ```mermaid classDiagram direction TB class QuantizationConfig { <<vLLM Interface>> +get_quant_method() } class AscendModelSlimConfig { +quant_description +get_quant_method() -create_scheme_for_layer() } class AscendCompressedTensorsConfig { +target_scheme_map +get_quant_method() -_get_scheme_from_parts() } class AscendLinearMethod { <<Wrapper>> +quant_method: AscendLinearScheme +create_weights() +apply() } class AscendFusedMoEMethod { <<Wrapper>> +quant_method: AscendMoEScheme +create_weights() +apply() } class AscendLinearScheme { <<Abstract>> +get_weight()* +apply()* +get_pertensor_param() +get_perchannel_param() } class AscendMoEScheme { <<Abstract>> +get_weight()* +get_dynamic_quant_param()* +apply()* } class W8A8DynamicLinear { +get_weight() +apply() } class W8A8DynamicMoE { +get_weight() +apply() } QuantizationConfig <|-- AscendModelSlimConfig QuantizationConfig <|-- AscendCompressedTensorsConfig AscendModelSlimConfig ..> AscendLinearMethod : creates AscendModelSlimConfig ..> AscendFusedMoEMethod : creates AscendCompressedTensorsConfig ..> AscendLinearMethod : creates AscendCompressedTensorsConfig ..> AscendFusedMoEMethod : creates AscendLinearMethod o-- AscendLinearScheme : delegates to AscendFusedMoEMethod o-- AscendMoEScheme : delegates to AscendLinearScheme <|-- W8A8DynamicLinear AscendMoEScheme <|-- W8A8DynamicMoE ``` ### Scheme Registration Flow ```mermaid sequenceDiagram participant Module as Scheme Module participant Registry as _SCHEME_REGISTRY participant Config as QuantConfig participant Wrapper as Wrapper Class Note over Module: At import time Module->>Registry: @register_scheme("W8A8_DYNAMIC", "linear") Registry->>Registry: Store (quant_type, layer_type) -> Class Note over Config: At runtime Config->>Config: Determine quant_type from description Config->>Registry: get_scheme_class(quant_type, layer_type) Registry-->>Config: Return scheme class Config->>Config: scheme = scheme_cls() Config->>Wrapper: Create wrapper with scheme Wrapper-->>Config: Return wrapper instance ``` ### File Changes Summary | Original Files | Refactored Files | |----------------|------------------| | `__init__.py` (empty) | `__init__.py` (exports public API) | | `quant_config.py` | `modelslim_config.py` + `wrappers.py` | | `compressed_tensors/` | `compressed_tensors_config.py` | | `utils.py` | `methods/registry.py` | | `w8a8_dynamic.py` | `methods/w8a8_dynamic.py` | | `w8a8.py` | `methods/w8a8_static.py` | | `w4a4_flatquant_dynamic.py` | `methods/w4a4_flatquant.py` | | ... | `methods/base.py` (new) | ### Benefits 1. **Extensibility**: Adding new quantization schemes only requires implementing the base class and adding `@register_scheme` decorator 2. **Maintainability**: Clear separation between config parsing, wrapper logic, and scheme implementation 3. **Testability**: Abstract base classes enable easier unit testing and mocking 4. **Discoverability**: Registry pattern makes it easy to list all supported schemes 5. **Reduced Coupling**: Config classes no longer need to know about all scheme implementations ___ - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d --------- Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-01-23 14:13:47 +08:00
config = AscendModelSlimConfig(bad_config)
with self.assertRaises(ValueError):
config.is_layer_skipped_ascend("fused_layer", fused_mapping)
def test_init_with_none_config(self):
config = AscendModelSlimConfig(None)
self.assertEqual(config.quant_description, {})
def test_init_with_default_config(self):
config = AscendModelSlimConfig()
self.assertEqual(config.quant_description, {})
def test_maybe_update_config_already_populated(self):
# When quant_description is already populated, should be a no-op
self.assertTrue(len(self.ascend_config.quant_description) > 0)
self.ascend_config.maybe_update_config("/some/model/path")
# quant_description should remain unchanged
self.assertEqual(self.ascend_config.quant_description,
self.sample_config)
def test_maybe_update_config_loads_from_file(self):
config = AscendModelSlimConfig()
self.assertEqual(config.quant_description, {})
quant_data = {"layer1.weight": "INT8", "layer2.weight": "FLOAT"}
with tempfile.TemporaryDirectory() as tmpdir:
config_path = os.path.join(tmpdir, MODELSLIM_CONFIG_FILENAME)
with open(config_path, "w") as f:
json.dump(quant_data, f)
config.maybe_update_config(tmpdir)
self.assertEqual(config.quant_description, quant_data)
def test_maybe_update_config_raises_when_file_missing(self):
config = AscendModelSlimConfig()
with tempfile.TemporaryDirectory() as tmpdir:
with self.assertRaises(ValueError) as ctx:
config.maybe_update_config(tmpdir)
error_msg = str(ctx.exception)
self.assertIn("ModelSlim Quantization Config Not Found", error_msg)
self.assertIn(MODELSLIM_CONFIG_FILENAME, error_msg)
def test_maybe_update_config_raises_with_json_files_listed(self):
config = AscendModelSlimConfig()
with tempfile.TemporaryDirectory() as tmpdir:
# Create a dummy json file that is NOT the config file
dummy_path = os.path.join(tmpdir, "config.json")
with open(dummy_path, "w") as f:
json.dump({"dummy": True}, f)
with self.assertRaises(ValueError) as ctx:
config.maybe_update_config(tmpdir)
error_msg = str(ctx.exception)
self.assertIn("config.json", error_msg)
def test_maybe_update_config_non_directory_raises(self):
config = AscendModelSlimConfig()
with self.assertRaises(ValueError) as ctx:
config.maybe_update_config("not_a_real_directory_path")
error_msg = str(ctx.exception)
self.assertIn("ModelSlim Quantization Config Not Found", error_msg)
def test_apply_extra_quant_adaptations_shared_head(self):
config = AscendModelSlimConfig()
config.quant_description = {
"model.layers.0.shared_head.weight": "INT8",
}
config._apply_extra_quant_adaptations()
self.assertIn("model.layers.0.weight", config.quant_description)
self.assertEqual(config.quant_description["model.layers.0.weight"],
"INT8")
def test_apply_extra_quant_adaptations_weight_packed(self):
config = AscendModelSlimConfig()
config.quant_description = {
"model.layers.0.weight_packed": "INT8",
}
config._apply_extra_quant_adaptations()
self.assertIn("model.layers.0.weight", config.quant_description)
self.assertEqual(config.quant_description["model.layers.0.weight"],
"INT8")
def test_get_scaled_act_names(self):
self.assertEqual(self.ascend_config.get_scaled_act_names(), [])