[main][Feature] Support Qwen3 W4A8 quantization (#2060)
### What this PR does / why we need it?
Adding `W4A8_DYNAMIC` quantization support for linear.
Dense models like Qwen3 can infer with `W4A8_DYNAMIC` quantization.
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?
Adding ut case in `tests/ut/quantization/test_w4a8_dynamic.py`
Adding e2e case in
`tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC`
to test qwen3 w4a8_dynamic quantized model
Note the w4a8_dynamic quantized model is quantized by `msit/msmodelslim`
of commit `d0abb0a47e1f1a473b866ad41b737fbc28fb1409`
1. Generate `W4A8_DYNAMIC` quantization weights using `msmodelslim`
```shell
git clone https://gitee.com/ascend/msit.git
cd msit/msmodelslim
git checkout d0abb0a47e1f1a473b866ad41b737fbc28fb1409
bash install.sh
```
2. Serve model using `vllm`
```shell
VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \
--model vllm-ascend/Qwen3-8B-W4A8 \
--port 8000 \
--quantization ascend \
--tensor_parallel_size 2 \
--enforce-eager
```
- vLLM version: v0.10.0
- vLLM main:
4cd7fe6cea
---------
Signed-off-by: ZhouXiang <zhouxiang100@huawei.com>
This commit is contained in:
27
tests/ut/quantization/test_w4a8_dynamic.py
Normal file
27
tests/ut/quantization/test_w4a8_dynamic.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import torch
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.quantization.w4a8_dynamic import AscendW4A8DynamicLinearMethod
|
||||
|
||||
|
||||
class TestAscendW4A8DynamicLinearMethod(TestBase):
|
||||
|
||||
def setUp(self):
|
||||
self.method = AscendW4A8DynamicLinearMethod()
|
||||
self.method.group_size = 8
|
||||
|
||||
def test_get_weight(self):
|
||||
weight = self.method.get_weight(8, 32, torch.bfloat16)
|
||||
self.assertEqual(weight["weight"].dtype, torch.int8)
|
||||
self.assertEqual(weight["weight"].shape, (32, 8))
|
||||
|
||||
def test_get_pergroup_param(self):
|
||||
params = self.method.get_pergroup_param(8, 32, torch.bfloat16)
|
||||
self.assertEqual(params["weight_scale"].dtype, torch.bfloat16)
|
||||
self.assertEqual(params["weight_scale"].shape, (32, 1))
|
||||
self.assertEqual(params["weight_offset"].dtype, torch.bfloat16)
|
||||
self.assertEqual(params["weight_offset"].shape, (32, 1))
|
||||
self.assertEqual(params["weight_scale_second"].dtype, torch.bfloat16)
|
||||
self.assertEqual(params["weight_scale_second"].shape, (32, 1))
|
||||
self.assertEqual(params["weight_offset_second"].dtype, torch.bfloat16)
|
||||
self.assertEqual(params["weight_offset_second"].shape, (32, 1))
|
||||
Reference in New Issue
Block a user