#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from modelscope import snapshot_download  # type: ignore[import-untyped]

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal


def test_qwen3_w8a8_quant():
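    """Greedy generation with the Ascend W8A8-quantized Qwen3-0.6B model should
    match the expected reference tokens and text."""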
    max_tokens = 5
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    vllm_target_outputs = [([
        85, 4086, 44, 374, 264, 1550, 42747, 628, 323, 4938, 72816, 44378, 323,
        13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
    ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
    )]
    with VllmRunner(
            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
            max_model_len=8192,
            gpu_memory_utilization=0.7,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as vllm_model:
        vllm_quant_w8a8_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_target_outputs,
        outputs_1_lst=vllm_quant_w8a8_outputs,
        name_0="vllm_target_outputs",
        name_1="vllm_w8a8_outputs",
    )


def test_qwen3_dense_w8a16():
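    """Greedy generation with the Ascend W8A16-quantized Qwen3-0.6B model should
    match the expected reference tokens and text."""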
    max_tokens = 5
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    vllm_target_outputs = [([
        85, 4086, 44, 374, 264, 1550, 42747, 628, 323, 4938, 72816, 44378, 323,
        13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
    ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
    )]
    with VllmRunner(
            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A16"),
            max_model_len=8192,
            enforce_eager=False,
            gpu_memory_utilization=0.7,
            quantization="ascend",
    ) as vllm_model:
        vllm_quant_w8a16_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_target_outputs,
        outputs_1_lst=vllm_quant_w8a16_outputs,
        name_0="vllm_target_outputs",
        name_1="vllm_w8a16_outputs",
    )