23 lines
666 B
Python
23 lines
666 B
Python
|
|
import pytest
|
||
|
|
|
||
|
|
from tests.e2e.conftest import VllmRunner
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("dtype", ["float16"])
|
||
|
|
@pytest.mark.parametrize("max_tokens", [5])
|
||
|
|
def test_qwen3_w8a8_e2e_310p(dtype: str, max_tokens: int) -> None:
|
||
|
|
example_prompts = [
|
||
|
|
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
|
||
|
|
]
|
||
|
|
|
||
|
|
with VllmRunner(
|
||
|
|
"vllm-ascend/Qwen3-32B-W8A8",
|
||
|
|
tensor_parallel_size=4,
|
||
|
|
dtype=dtype,
|
||
|
|
max_model_len=8192,
|
||
|
|
enforce_eager=True,
|
||
|
|
quantization="ascend",
|
||
|
|
enable_prefix_caching=False,
|
||
|
|
) as vllm_model:
|
||
|
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|