xc-llm-ascend/tests/e2e/310p/test_offline_inference_w8a8_310p.py

import pytest

from tests.e2e.conftest import VllmRunner


@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [5])
def test_qwen3_w8a8_e2e_310p(dtype: str, max_tokens: int) -> None:
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
    ]

    with VllmRunner(
        "vllm-ascend/Qwen3-32B-W8A8",
        tensor_parallel_size=4,
        dtype=dtype,
        max_model_len=8192,
        enforce_eager=True,
        quantization="ascend",
        enable_prefix_caching=False,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)