From 0942d9aaabb6344b542f43147add9077e9c49c3c Mon Sep 17 00:00:00 2001
From: 22dimensions
Date: Fri, 19 Sep 2025 20:50:14 +0800
Subject: [PATCH] [3/N][Refactor][Quantization] remove packed_modules_mapping from models (#3021)

### What this PR does / why we need it?
Some custom models in vllm-ascend define their own `packed_modules_mapping`, which prevents reusing the model classes from the vLLM community unchanged. This PR moves these custom mappings into a per-model-type table in `vllm_ascend/quantization/quant_config.py`. After this PR, some of the custom model classes can be removed.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Tested by CI.

- vLLM version: v0.10.2
- vLLM main: https://github.com/vllm-project/vllm/commit/5089fd749cbe4233a29f29ce706d56c47464c117

Signed-off-by: 22dimensions
---
 tests/ut/models/test_qwen3_moe.py          | 30 -----------
 tests/ut/quantization/test_quant_config.py | 18 +++++--
 vllm_ascend/models/deepseek_mtp.py         |  8 ---
 vllm_ascend/models/deepseek_v2.py          |  6 ---
 vllm_ascend/models/qwen2_5_vl.py           | 11 ----
 vllm_ascend/models/qwen3_moe.py            | 13 -----
 vllm_ascend/models/qwen3_next.py           |  9 ----
 vllm_ascend/quantization/quant_config.py   | 61 ++++++++++++++++++++++
 8 files changed, 76 insertions(+), 80 deletions(-)

diff --git a/tests/ut/models/test_qwen3_moe.py b/tests/ut/models/test_qwen3_moe.py
index e882fe2..858b106 100644
--- a/tests/ut/models/test_qwen3_moe.py
+++ b/tests/ut/models/test_qwen3_moe.py
@@ -15,41 +15,11 @@
 import math
 import unittest

-import pytest
 import torch
-from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM

-from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
 from vllm_ascend.torchair.models.qwen3_moe import CustomQwen3MoeAttention


-class TestCustomQwen3MoeForCausalLM:
-
-    def test_class_inheritance(self):
-        assert issubclass(CustomQwen3MoeForCausalLM, Qwen3MoeForCausalLM)
-
-    @pytest.mark.parametrize("key, expected", [
-        ("qkv_proj", ["q_proj", "k_proj", "v_proj"]),
-        ("gate_up_proj", ["gate_proj", "up_proj"]),
-        ("experts",
-         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]),
-    ])
-    def test_packed_modules_mapping(self, key, expected):
-        assert CustomQwen3MoeForCausalLM.packed_modules_mapping[
-            key] == expected
-
-    def test_packed_modules_mapping_structure(self):
-        expected_mapping = {
-            "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-            "gate_up_proj": ["gate_proj", "up_proj"],
-            "experts": [
-                "experts.0.gate_proj", "experts.0.up_proj",
-                "experts.0.down_proj"
-            ]
-        }
-        assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping
-
-
 class DummyRMSNorm:

     def __init__(self, dim: int, eps: float = 1e-6):
diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py
index fa5d13e..5a119b4 100644
--- a/tests/ut/quantization/test_quant_config.py
+++ b/tests/ut/quantization/test_quant_config.py
@@ -73,9 +73,12 @@ class TestAscendQuantConfig(TestBase):
         self.assertIsNone(result)

     def test_get_quant_method_for_linear(self):
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
         linear_layer = MagicMock(spec=LinearBase)
         # Test skipped layer
-        with patch.object(self.ascend_config,
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+            patch.object(self.ascend_config, \
                           'is_layer_skipped_ascend',
                           return_value=True):
             method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -83,6 +86,7 @@
         # Test quantized layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend',
                           return_value=False), \
+            patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendLinearMethod',
                   return_value=MagicMock()) as mock_ascend_linear:
             method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -93,14 +97,18 @@

     def test_get_quant_method_for_attention(self):
         attention_layer = MagicMock(spec=Attention)
-        with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+            patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                    return_value=MagicMock()) as mock_ascend_kvcache:
             # Test with fa_quant_type
             method = self.ascend_config.get_quant_method(
                 attention_layer, ".attn")
             self.assertIs(method, mock_ascend_kvcache.return_value)

-        with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
+        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
+            patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                    return_value=MagicMock()) as mock_ascend_kvcache:
             # Test with kv_quant_type
             modified_config = {"kv_quant_type": "C8"}
@@ -113,9 +121,12 @@
         fused_moe_layer = MagicMock(spec=FusedMoE)
         fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
         fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
+        mock_config = MagicMock()
+        mock_config.model_config.hf_config.model_type = None
         # Test skipped layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
+            patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
             method = self.ascend_config.get_quant_method(
                 fused_moe_layer, "moe_layer")
@@ -123,6 +134,7 @@
         # Test quantized layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
+            patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
             method = self.ascend_config.get_quant_method(
                 fused_moe_layer, "moe_layer")
diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py
index e9c2eaa..80bc66e 100644
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -180,14 +180,6 @@ class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):


 class CustomDeepSeekMTP(DeepSeekMTP):
-    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
-    # NOTE 2.The description file generated by the current msmodelslim tool does not have
-    # MTP layer info. Please manually add it and set the value to FLOAT.
-    packed_modules_mapping = {
-        "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
-    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py
index 7d78a0b..33e5145 100644
--- a/vllm_ascend/models/deepseek_v2.py
+++ b/vllm_ascend/models/deepseek_v2.py
@@ -320,12 +320,6 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):


 class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
-    # add `packed_modules_mapping` in `DeepseekV2ForCausalLM` to support weight merging
-    packed_modules_mapping = {
-        "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
-    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py
index b15946a..4497104 100644
--- a/vllm_ascend/models/qwen2_5_vl.py
+++ b/vllm_ascend/models/qwen2_5_vl.py
@@ -491,17 +491,6 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
                                         dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
 class AscendQwen2_5_VLForConditionalGeneration(
         Qwen2_5_VLForConditionalGeneration):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py
index 4ee41eb..7ad54a2 100644
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -318,19 +318,6 @@ class CustomQwen3MoeModel(Qwen3MoeModel):


 class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
diff --git a/vllm_ascend/models/qwen3_next.py b/vllm_ascend/models/qwen3_next.py
index a94e72d..6234c83 100644
--- a/vllm_ascend/models/qwen3_next.py
+++ b/vllm_ascend/models/qwen3_next.py
@@ -1166,15 +1166,6 @@ class Qwen3NextModel(nn.Module):

 class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                            MixtureOfExperts, IsHybrid):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": ["gate_proj", "up_proj"],
-        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
-    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 6124fcb..b89fd08 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -19,6 +19,7 @@
 from types import MappingProxyType
 from typing import Any, Callable, Dict, List, Mapping, Optional
 import torch
+from vllm.config import get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
@@ -89,6 +90,11 @@ class AscendQuantConfig(QuantizationConfig):

     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
Optional["QuantizeMethodBase"]: + vllm_config = get_current_vllm_config() + model_type = vllm_config.model_config.hf_config.model_type + if model_type in packed_modules_model_mapping: + self.packed_modules_mapping = packed_modules_model_mapping[ + model_type] from vllm.attention.layer import Attention if prefix.startswith("language_model"): prefix = prefix.split('.', 1)[-1] @@ -153,6 +159,61 @@ class AscendQuantConfig(QuantizationConfig): return [] +packed_modules_model_mapping = { + "qwen3_moe": { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + }, + "deepseek_v2": { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + }, + "deepseek_v3": { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + }, + # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized; + # NOTE 2.The description file generated by the current msmodelslim tool does not have + # MTP layer info. Please manually add it and set the value to FLOAT. + "deepseek_mtp": { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + }, + "qwen3_next": { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["gate_proj", "up_proj"], + "in_proj": ["in_proj_qkvz", "in_proj_ba"], + }, + "qwen2_5_vl": { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } +} + + class AscendLinearMethod(LinearMethodBase): """Linear method for Ascend quantization.