### What this PR does / why we need it?
Supports Deepseek-R1 w4a8 quantization.
Since R1 w4a8 uses mixed quantization, only the MOE layer uses
w4a8_dynamic quantization, so we added the w4a8_dynamic.py file, which
includes the AscendW4A8DynamicFusedMoEMethod class.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Adding ut case in `tests/ut/quantization/test_w4a8_dynamic.py` and
`tests/ut/quantization/test_quantizer.py`
Adding e2e case in
`tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC`
to test deepseek w4a8_dynamic quantized model
#### 1. How to get weights using msModelSlim
##### Installation steps
Use the branch master, the commit id is:
298e175d69b3b855111a1e09bbe2fcd12fdb4e24
git clone https://gitee.com/ascend/msit.git
cd msit/msmodelslim
bash install.sh
##### The required transformers environment
transformers>=4.48.2
##### Generate w4a8 weights
cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md Execute the
[pre-check](https://gitee.com/ascend/msit/blob/master/msmodelslim/example/DeepSeek/README.md#%E8%BF%90%E8%A1%8C%E5%89%8D%E5%BF%85%E6%A3%80)
and [DeepSeek-R1 w4a8 mix
quantization](https://gitee.com/ascend/msit/blob/master/msmodelslim/example/DeepSeek/README.md#deepseek-r1-w4a8-%E6%B7%B7%E5%90%88%E9%87%8F%E5%8C%96%E5%89%8D%E4%B8%89%E5%B1%82-mlpw8a8-dynamic-%E9%87%8F%E5%8C%96mla%E5%85%B1%E4%BA%AB%E4%B8%93%E5%AE%B6w8a8%E9%87%8F%E5%8C%96%E8%B7%AF%E7%94%B1%E4%B8%93%E5%AE%B6w4a8-dynamic%E9%87%8F%E5%8C%96)
chapter
Reference command: `python3 quant_deepseek_w4a8.py --model_path {original
weight path} --save_path {generated weight path} --mindie_format`
##### Adapt to vllm-ascend
Since `--mindie_format` produces weights in the MindIE layout, a few
adaptations are needed before vllm-ascend can use them:
`quant_model_description_w8a8_dynamic.json` rename to
`quant_model_description.json`, and add `"group_size": 256`
Modification in `config.json`:`"model_type":deepseekv2` is changed to
`"model_type":deepseek_v3`; `quantization_config` is removed;
Tip: the `group_size` must match the weights. If the w4a8 weights were not
generated with msmodelslim, you can look up the correct `group_size` in the
`quantization_config` section of `config.json`.
#### 2. How to run w4a8
##### a.How to run eager mode
export VLLM_USE_V1=1 # v1
python -m vllm.entrypoints.openai.api_server --model=$1
--trust-remote-code -tp $2 -dp $3 --enable_expert_parallel
--quantization ascend --port $4 --max-model-len $5 --max-num-seqs $6
--enforce-eager
eg: python -m vllm.entrypoints.openai.api_server
--model=/weightpath/w4a8_4_layer --trust-remote-code -tp 4 -dp 4
--enable_expert_parallel --quantization ascend --port 8002
--max-model-len 5120 --max-num-seqs 128 --enforce-eager
##### b.How to run graph mode
export VLLM_USE_V1=1 # v1
export HCCL_BUFFSIZE=1024
python -m vllm.entrypoints.openai.api_server --model=$1
--trust-remote-code -tp $2 -dp $3 --enable_expert_parallel
--quantization ascend --port $4 --max-model-len $5
--additional_config='{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
eg: python -m vllm.entrypoints.openai.api_server
--model=/weight/dsr1_w4a8_vllm --trust-remote-code -tp 4 -dp 4
--enable_expert_parallel --quantization ascend --port 8002
--max-model-len 5120
--additional_config='{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
- vLLM version: v0.10.0
- vLLM main:
c494f96fbc
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
146 lines · 5.7 KiB · Python
from unittest.mock import MagicMock, patch
|
|
|
|
from tests.ut.base import TestBase
|
|
from vllm_ascend.quantization.quant_config import AscendQuantConfig
|
|
from vllm_ascend.quantization.quantizer import (VLLMAscendQuantizer,
|
|
W4A8DYNAMICQuantizer,
|
|
W8A8Quantizer)
|
|
|
|
# Local stand-in for the registry in vllm_ascend.quantization.quantizer;
# tests patch the real SUPPORT_ASCEND_QUANTIZER_TYPE mapping with this dict
# (setUp adds mock entries, tearDown restores this original content).
SUPPORT_ASCEND_QUANTIZER_TYPE = {"test": "1"}
|
|
|
|
|
|
class TestGetQuantizer(TestBase):
    """Tests for VLLMAscendQuantizer.get_quantizer type selection.

    Each test installs mock quantizer classes into the (patched) type
    registry and checks that get_quantizer instantiates and returns the
    quantizer matching the quant description / layer prefix.
    """

    # Dotted path of the registry that get_quantizer reads.
    QUANTIZER_REGISTRY = (
        'vllm_ascend.quantization.quantizer.SUPPORT_ASCEND_QUANTIZER_TYPE')

    def setUp(self):
        # Mock quantizer classes keyed by quant-type name; each MagicMock
        # stands in for a quantizer class whose `_instance` attribute
        # get_quantizer is expected to return after instantiation.
        self.supported_types = {
            'INT8': MagicMock(_instance=None),
            'FP16': MagicMock(_instance=None),
            'C8': MagicMock(_instance=None)
        }
        # Mutate the module-level mapping so patch.dict sees the mocks;
        # the original content is saved for restoration in tearDown.
        self.original_supported_types = SUPPORT_ASCEND_QUANTIZER_TYPE.copy()
        SUPPORT_ASCEND_QUANTIZER_TYPE.update(self.supported_types)
        self.mock_quant_config = MagicMock(spec=AscendQuantConfig)
        self.mock_quant_config.quant_description = {"some_config": "value"}

    def tearDown(self):
        # Restore the module-level mapping mutated in setUp.
        SUPPORT_ASCEND_QUANTIZER_TYPE.clear()
        SUPPORT_ASCEND_QUANTIZER_TYPE.update(self.original_supported_types)

    def _get_and_verify(self, quant_description, prefix, expected_type):
        """Run get_quantizer under a patched registry and verify that the
        quantizer class registered for *expected_type* was instantiated
        exactly once with *quant_description* and its instance returned."""
        with patch.dict(self.QUANTIZER_REGISTRY,
                        SUPPORT_ASCEND_QUANTIZER_TYPE):
            result = VLLMAscendQuantizer.get_quantizer(
                quant_description,
                prefix,
                packed_modules_mapping={"some": "mapping"})

        self.assertIsNotNone(result)
        self.assertEqual(result,
                         self.supported_types[expected_type]._instance)
        self.supported_types[expected_type].assert_called_once_with(
            quant_description)

    def test_get_quantizer_fa(self):
        """fa_quant_type in the description selects the FA (C8) quantizer."""
        self._get_and_verify({'fa_quant_type': 'C8'}, '.attn', 'C8')

    def test_get_quantizer_kv(self):
        """kv_quant_type in the description selects the KV-cache quantizer."""
        self._get_and_verify({'kv_quant_type': 'C8'}, '.attn', 'C8')

    def test_get_quantizer_linear(self):
        """Linear layers resolve their type via get_linear_quant_type."""
        with patch(
                'vllm_ascend.quantization.quantizer.VLLMAscendQuantizer.get_linear_quant_type',
                return_value='INT8'):
            self._get_and_verify({'linear_type': 'INT8'}, 'nothing', 'INT8')
|
class TestW8A8Quantizer(TestBase):
    """Tests that W8A8Quantizer builds the expected per-layer methods."""

    def setUp(self):
        self.quantizer = W8A8Quantizer(quant_description={})

    def test_build_linear_method(self):
        """build_linear_method constructs an AscendW8A8LinearMethod."""
        with patch('vllm_ascend.quantization.quantizer.AscendW8A8LinearMethod',
                   return_value=MagicMock()) as mock_linear:
            result = self.quantizer.build_linear_method()
            mock_linear.assert_called_once_with()
            # Stronger than isinstance(result, MagicMock): the quantizer
            # must return the very object the patched constructor produced.
            self.assertIs(result, mock_linear.return_value)

    def test_build_moe_method(self):
        """build_moe_method constructs an AscendW8A8FusedMoEMethod."""
        with patch(
                'vllm_ascend.quantization.quantizer.AscendW8A8FusedMoEMethod',
                return_value=MagicMock()) as mock_fused_moe:
            result = self.quantizer.build_moe_method()
            mock_fused_moe.assert_called_once_with()
            self.assertIs(result, mock_fused_moe.return_value)

    def test_build_attention_method(self):
        """build_attention_method constructs an AscendC8KVCacheMethod."""
        with patch('vllm_ascend.quantization.quantizer.AscendC8KVCacheMethod',
                   return_value=MagicMock()) as mock_attention:
            result = self.quantizer.build_attention_method()
            mock_attention.assert_called_once_with()
            self.assertIs(result, mock_attention.return_value)
|
class TestW4A8DYNAMICQuantizer(TestBase):
    """Tests that W4A8DYNAMICQuantizer builds the expected per-layer methods."""

    def setUp(self):
        self.quantizer = W4A8DYNAMICQuantizer(quant_description={})

    def test_build_linear_method(self):
        """build_linear_method constructs an AscendW4A8DynamicLinearMethod."""
        with patch(
                'vllm_ascend.quantization.quantizer.AscendW4A8DynamicLinearMethod',
                return_value=MagicMock()) as mock_linear:
            result = self.quantizer.build_linear_method()
            mock_linear.assert_called_once_with()
            # Stronger than isinstance(result, MagicMock): the quantizer
            # must return the very object the patched constructor produced.
            self.assertIs(result, mock_linear.return_value)

    def test_build_moe_method(self):
        """build_moe_method constructs an AscendW4A8DynamicFusedMoEMethod."""
        with patch(
                'vllm_ascend.quantization.quantizer.AscendW4A8DynamicFusedMoEMethod',
                return_value=MagicMock()) as mock_fused_moe:
            result = self.quantizer.build_moe_method()
            mock_fused_moe.assert_called_once_with()
            self.assertIs(result, mock_fused_moe.return_value)