[Core] Make V1 work and enable V1 engine test (#389)

1. Make sure the version is a string before parsing it in collect_env
2. Add basic V1 engine test

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-03-28 19:34:23 +08:00
committed by GitHub
parent 57a84bb7be
commit 31f29b9f30
8 changed files with 66 additions and 95 deletions

View File

@@ -100,3 +100,4 @@ def test_fused_experts(
e_map)
# TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
torch.npu.empty_cache()

View File

@@ -45,8 +45,6 @@ def test_models(
dtype: str,
max_tokens: int,
) -> None:
os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND"
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
@@ -60,3 +58,8 @@ def test_models(
enforce_eager=False,
gpu_memory_utilization=0.7) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
if __name__ == "__main__":
import pytest
pytest.main([__file__])