[3/N][Refactor][Quantization]remove packed_modules_mapping from models (#3021)

### What this PR does / why we need it?

Some custom models in vllm-ascend define `packed_modules_mapping`, which
prevents keeping the same model classes as the vLLM community. So we move
these custom `packed_modules_mapping` entries to the quantization `utils.py`.
After this PR, some custom model classes can be removed.

### Does this PR introduce _any_ user-facing change?

tested by CI

### How was this patch tested?

tested by CI

- vLLM version: v0.10.2
- vLLM main:
5089fd749c

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
This commit is contained in:
22dimensions
2025-09-19 20:50:14 +08:00
committed by GitHub
parent 4ba56716f9
commit 0942d9aaab
8 changed files with 76 additions and 80 deletions

View File

@@ -15,41 +15,11 @@
import math
import unittest
import pytest
import torch
from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
from vllm_ascend.torchair.models.qwen3_moe import CustomQwen3MoeAttention
class TestCustomQwen3MoeForCausalLM:
    """Tests for the Ascend-specific CustomQwen3MoeForCausalLM model class.

    Verifies that the custom class stays a drop-in subclass of the upstream
    vLLM Qwen3MoeForCausalLM, and that its ``packed_modules_mapping`` (which
    maps each fused/packed module name to the original per-weight module
    names it packs together) has exactly the expected contents.
    """

    def test_class_inheritance(self):
        # The custom model must remain substitutable for the upstream
        # vLLM Qwen3MoeForCausalLM class.
        assert issubclass(CustomQwen3MoeForCausalLM, Qwen3MoeForCausalLM)

    @pytest.mark.parametrize("key, expected", [
        ("qkv_proj", ["q_proj", "k_proj", "v_proj"]),
        ("gate_up_proj", ["gate_proj", "up_proj"]),
        ("experts",
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]),
    ])
    def test_packed_modules_mapping(self, key, expected):
        # Each packed module key must map to its expected list of
        # source (unpacked) module names, in order.
        assert CustomQwen3MoeForCausalLM.packed_modules_mapping[
            key] == expected

    def test_packed_modules_mapping_structure(self):
        # The mapping must contain exactly these three keys and nothing
        # else — an extra or missing entry would break weight loading
        # for quantized checkpoints.
        expected_mapping = {
            "qkv_proj": ["q_proj", "k_proj", "v_proj"],
            "gate_up_proj": ["gate_proj", "up_proj"],
            "experts": [
                "experts.0.gate_proj", "experts.0.up_proj",
                "experts.0.down_proj"
            ]
        }
        assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping
class DummyRMSNorm:
def __init__(self, dim: int, eps: float = 1e-6):

View File

@@ -73,9 +73,12 @@ class TestAscendQuantConfig(TestBase):
self.assertIsNone(result)
def test_get_quant_method_for_linear(self):
mock_config = MagicMock()
mock_config.model_config.hf_config.model_type = None
linear_layer = MagicMock(spec=LinearBase)
# Test skipped layer
with patch.object(self.ascend_config,
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch.object(self.ascend_config, \
'is_layer_skipped_ascend',
return_value=True):
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -83,6 +86,7 @@ class TestAscendQuantConfig(TestBase):
# Test quantized layer
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear:
method = self.ascend_config.get_quant_method(linear_layer, ".attn")
@@ -93,14 +97,18 @@ class TestAscendQuantConfig(TestBase):
def test_get_quant_method_for_attention(self):
attention_layer = MagicMock(spec=Attention)
with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
mock_config = MagicMock()
mock_config.model_config.hf_config.model_type = None
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
return_value=MagicMock()) as mock_ascend_kvcache:
# Test with fa_quant_type
method = self.ascend_config.get_quant_method(
attention_layer, ".attn")
self.assertIs(method, mock_ascend_kvcache.return_value)
with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod',
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
return_value=MagicMock()) as mock_ascend_kvcache:
# Test with kv_quant_type
modified_config = {"kv_quant_type": "C8"}
@@ -113,9 +121,12 @@ class TestAscendQuantConfig(TestBase):
fused_moe_layer = MagicMock(spec=FusedMoE)
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
mock_config = MagicMock()
mock_config.model_config.hf_config.model_type = None
# Test skipped layer
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
method = self.ascend_config.get_quant_method(
fused_moe_layer, "moe_layer")
@@ -123,6 +134,7 @@ class TestAscendQuantConfig(TestBase):
# Test quantized layer
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \
patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
patch('vllm_ascend.quantization.quant_config.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe:
method = self.ascend_config.get_quant_method(
fused_moe_layer, "moe_layer")