xc-llm-ascend/tests/ut/quantization/test_w4a8_dynamic.py

import torch

from tests.ut.base import TestBase
from vllm_ascend.quantization.w4a8_dynamic import AscendW4A8DynamicLinearMethod


class TestAscendW4A8DynamicLinearMethod(TestBase):
    """Checks the dtype and shape of tensors built by AscendW4A8DynamicLinearMethod.

    Each test calls one of the method object's parameter factories with the
    arguments ``(8, 32, torch.bfloat16)`` and verifies the dtype and shape of
    every entry it inspects in the returned mapping.
    """

    def setUp(self):
        # A fresh method object per test, with group_size pinned to 8.
        # NOTE(review): presumably group_size == 8 together with the first
        # argument 8 is what yields a trailing dimension of 1 in the
        # per-group tensors -- confirm against the implementation.
        self.method = AscendW4A8DynamicLinearMethod()
        setattr(self.method, "group_size", 8)

    def test_get_weight(self):
        # The "weight" entry must be an int8 tensor of shape (32, 8), even
        # though bfloat16 is the dtype passed in.
        weight_tensor = self.method.get_weight(8, 32, torch.bfloat16)["weight"]
        self.assertEqual(weight_tensor.dtype, torch.int8)
        self.assertEqual(weight_tensor.shape, (32, 8))

    def test_get_pergroup_param(self):
        # All four scale/offset entries must be bfloat16 tensors of shape
        # (32, 1); they are checked in the order listed below.
        pergroup = self.method.get_pergroup_param(8, 32, torch.bfloat16)
        expected_keys = (
            "weight_scale",
            "weight_offset",
            "weight_scale_second",
            "weight_offset_second",
        )
        for key in expected_keys:
            self.assertEqual(pergroup[key].dtype, torch.bfloat16)
            self.assertEqual(pergroup[key].shape, (32, 1))
[main][Feature] Support Qwen3 W4A8 quantization (#2060)

### What this PR does / why we need it?
Adds `W4A8_DYNAMIC` quantization support for linear layers. Dense models such as Qwen3 can now run inference with `W4A8_DYNAMIC` quantization.

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
- Added a UT case in `tests/ut/quantization/test_w4a8_dynamic.py`.
- Added an e2e case in `tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC` to test the Qwen3 `w4a8_dynamic` quantized model.

Note: the `w4a8_dynamic` quantized model is quantized by `msit/msmodelslim` at commit `d0abb0a47e1f1a473b866ad41b737fbc28fb1409`.

1. Generate `W4A8_DYNAMIC` quantization weights using `msmodelslim`

```shell
git clone https://gitee.com/ascend/msit.git
cd msit/msmodelslim
git checkout d0abb0a47e1f1a473b866ad41b737fbc28fb1409
bash install.sh
```

2. Serve the model using `vllm`

```shell
VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \
    --model vllm-ascend/Qwen3-8B-W4A8 \
    --port 8000 \
    --quantization ascend \
    --tensor_parallel_size 2 \
    --enforce-eager
```

- vLLM version: v0.10.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4cd7fe6ceaf5ad7d8ac2ba5597cd964c6db7e306

---------

Signed-off-by: ZhouXiang <zhouxiang100@huawei.com>

2025-07-30 14:57:14 +08:00			`import torch`

			`from tests.ut.base import TestBase`
			`from vllm_ascend.quantization.w4a8_dynamic import AscendW4A8DynamicLinearMethod`


			`class TestAscendW4A8DynamicLinearMethod(TestBase):`

			`def setUp(self):`
			`self.method = AscendW4A8DynamicLinearMethod()`
			`self.method.group_size = 8`

			`def test_get_weight(self):`
			`weight = self.method.get_weight(8, 32, torch.bfloat16)`
			`self.assertEqual(weight["weight"].dtype, torch.int8)`
			`self.assertEqual(weight["weight"].shape, (32, 8))`

			`def test_get_pergroup_param(self):`
			`params = self.method.get_pergroup_param(8, 32, torch.bfloat16)`
			`self.assertEqual(params["weight_scale"].dtype, torch.bfloat16)`
			`self.assertEqual(params["weight_scale"].shape, (32, 1))`
			`self.assertEqual(params["weight_offset"].dtype, torch.bfloat16)`
			`self.assertEqual(params["weight_offset"].shape, (32, 1))`
			`self.assertEqual(params["weight_scale_second"].dtype, torch.bfloat16)`
			`self.assertEqual(params["weight_scale_second"].shape, (32, 1))`
			`self.assertEqual(params["weight_offset_second"].dtype, torch.bfloat16)`
			`self.assertEqual(params["weight_offset_second"].shape, (32, 1))`