[Feature][Quant] Auto-detect quantization format from model files (#6645)
## Summary
- Add automatic quantization format detection, eliminating the need to
manually specify `--quantization` when serving quantized models.
- The detection inspects only lightweight JSON files
(`quant_model_description.json` and `config.json`) at engine
initialization time, with no `.safetensors` reads.
- User-explicit `--quantization` flags are always respected;
auto-detection only applies when the flag is omitted.
## Details
**Detection priority:**
1. `quant_model_description.json` exists → `quantization="ascend"`
(ModelSlim)
2. `config.json` contains `"quant_method": "compressed-tensors"` →
`quantization="compressed-tensors"` (LLM-Compressor)
3. Neither → default float behavior
**Technical approach:**
Hooked into `NPUPlatform.check_and_update_config()` to run detection
after `VllmConfig.__post_init__`. Since `quant_config` is already `None`
at that point, we explicitly recreate it via
`VllmConfig._get_quantization_config()` to trigger the full quantization
initialization pipeline.
## Files Changed
| File | Description |
|------|-------------|
| `vllm_ascend/quantization/utils.py` | Added
`detect_quantization_method()` and `maybe_auto_detect_quantization()` |
| `vllm_ascend/platform.py` | Integrated auto-detection in
`check_and_update_config()` |
| `vllm_ascend/quantization/modelslim_config.py` | Improved error
handling for weight loading |
- vLLM version: v0.15.0
- vLLM main:
d7e17aaacd
---------
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -116,13 +116,14 @@ class TestNPUPlatform(TestBase):
|
||||
self.assertIsNone(self.platform.inference_mode())
|
||||
mock_inference_mode.assert_called_once()
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.utils.update_aclgraph_sizes")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("os.environ", {})
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_basic_config_update(
|
||||
self, mock_init_recompute, mock_soc_version, mock_update_acl, mock_init_ascend
|
||||
self, mock_init_recompute, mock_soc_version, mock_update_acl, mock_init_ascend, mock_auto_detect
|
||||
):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
@@ -146,11 +147,12 @@ class TestNPUPlatform(TestBase):
|
||||
|
||||
mock_init_ascend.assert_called_once_with(vllm_config)
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_no_model_config_warning(
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect
|
||||
):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
@@ -172,10 +174,11 @@ class TestNPUPlatform(TestBase):
|
||||
|
||||
self.assertTrue("Model config is missing" in cm.output[0])
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_enforce_eager_mode(self, mock_init_recompute, mock_init_ascend, mock_soc_version):
|
||||
def test_check_and_update_config_enforce_eager_mode(self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
vllm_config.model_config.enforce_eager = True
|
||||
@@ -206,11 +209,12 @@ class TestNPUPlatform(TestBase):
|
||||
CUDAGraphMode.NONE,
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_unsupported_compilation_level(
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect
|
||||
):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
@@ -244,9 +248,10 @@ class TestNPUPlatform(TestBase):
|
||||
)
|
||||
|
||||
@pytest.mark.skip("Revert me when vllm support setting cudagraph_mode on oot platform")
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
def test_check_and_update_config_unsupported_cudagraph_mode(self, mock_init_ascend, mock_soc_version):
|
||||
def test_check_and_update_config_unsupported_cudagraph_mode(self, mock_init_ascend, mock_soc_version, mock_auto_detect):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
vllm_config.model_config.enforce_eager = False
|
||||
@@ -268,11 +273,12 @@ class TestNPUPlatform(TestBase):
|
||||
CUDAGraphMode.NONE,
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_cache_config_block_size(
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect
|
||||
):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
@@ -292,11 +298,12 @@ class TestNPUPlatform(TestBase):
|
||||
|
||||
self.assertEqual(vllm_config.cache_config.block_size, 128)
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3)
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_v1_worker_class_selection(
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version
|
||||
self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect
|
||||
):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
@@ -327,10 +334,11 @@ class TestNPUPlatform(TestBase):
|
||||
"vllm_ascend.xlite.xlite_worker.XliteWorker",
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._310P)
|
||||
@patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config")
|
||||
def test_check_and_update_config_310p_no_custom_ops(self, mock_init_recompute, mock_soc_version, mock_init_ascend):
|
||||
def test_check_and_update_config_310p_no_custom_ops(self, mock_init_recompute, mock_soc_version, mock_init_ascend, mock_auto_detect):
|
||||
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config()
|
||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
||||
vllm_config.compilation_config.custom_ops = []
|
||||
|
||||
Reference in New Issue
Block a user