Files
xc-llm-ascend/tests/e2e/310p/test_offline_inference_w8a8_310p.py

23 lines
666 B
Python
Raw Normal View History

import pytest
from tests.e2e.conftest import VllmRunner
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [5])
def test_qwen3_w8a8_e2e_310p(dtype: str, max_tokens: int) -> None:
    """Smoke-test greedy generation with the W8A8 Qwen3-32B checkpoint.

    Nothing about the generated output is checked: the test passes as long
    as the runner can be entered and ``generate_greedy`` returns without
    raising.

    Args:
        dtype: Forwarded unchanged as the runner's ``dtype`` keyword.
        max_tokens: Forwarded unchanged as the second positional argument
            of ``generate_greedy``.
    """
    prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
    ]

    # Engine configuration kept in one place so the ``with`` line stays short.
    runner_options = dict(
        tensor_parallel_size=4,
        dtype=dtype,
        max_model_len=8192,
        enforce_eager=True,
        quantization="ascend",
        enable_prefix_caching=False,
    )

    with VllmRunner("vllm-ascend/Qwen3-32B-W8A8", **runner_options) as runner:
        runner.generate_greedy(prompts, max_tokens)