[3/N][Refactor][Quantization]remove packed_modules_mapping from models (#3021)

### What this PR does / why we need it? Some custom models in vllm-ascend define packed_modules_mapping, which prevents them from sharing the same model class as the vLLM community. This PR moves these custom packed_modules_mapping definitions into the quantization utils.py. After this PR, some custom models can be removed. ### Does this PR introduce _any_ user-facing change? Tested by CI. ### How was this patch tested? Tested by CI. - vLLM version: v0.10.2 - vLLM main: 5089fd749c Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-19 20:50:14 +08:00
parent 4ba56716f9
commit 0942d9aaab
8 changed files with 76 additions and 80 deletions
--- a/tests/ut/quantization/test_quant_config.py
+++ b/tests/ut/quantization/test_quant_config.py
@@ -73,9 +73,12 @@ class TestAscendQuantConfig(TestBase):
        self.assertIsNone(result)

    def test_get_quant_method_for_linear(self):
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
        linear_layer = MagicMock(spec=LinearBase)
        # Test skipped layer
-        with patch.object(self.ascend_config,
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+            patch.object(self.ascend_config, \
                          'is_layer_skipped_ascend',
                          return_value=True):
            method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -83,6 +86,7 @@ class TestAscendQuantConfig(TestBase):

        # Test quantized layer
        with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
+            patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
            patch('vllm_ascend.quantization.quant_config.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear:

            method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -93,14 +97,18 @@ class TestAscendQuantConfig(TestBase):

    def test_get_quant_method_for_attention(self):
        attention_layer = MagicMock(spec=Attention)
-        with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+            patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                   return_value=MagicMock()) as mock_ascend_kvcache:
            # Test with fa_quant_type
            method = self.ascend_config.get_quant_method(
                attention_layer, ".attn")
            self.assertIs(method, mock_ascend_kvcache.return_value)

-        with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+            patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                   return_value=MagicMock()) as mock_ascend_kvcache:
            # Test with kv_quant_type
            modified_config = {"kv_quant_type": "C8"}
@@ -113,9 +121,12 @@ class TestAscendQuantConfig(TestBase):
        fused_moe_layer = MagicMock(spec=FusedMoE)
        fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
        fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None

        # Test skipped layer
        with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
+            patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
            patch('vllm_ascend.quantization.quant_config.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
            method = self.ascend_config.get_quant_method(
                fused_moe_layer, "moe_layer")
@@ -123,6 +134,7 @@ class TestAscendQuantConfig(TestBase):

        # Test quantized layer
        with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
+            patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
            patch('vllm_ascend.quantization.quant_config.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
            method = self.ascend_config.get_quant_method(
                fused_moe_layer, "moe_layer")