[Core] Make V1 work and enable V1 engine test (#389)

1. Make sure the version is a string before parsing it in collect_env
2. Add a basic V1 engine test

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-28 19:34:23 +08:00
parent 57a84bb7be
commit 31f29b9f30
8 changed files with 66 additions and 95 deletions
--- a/tests/ops/test_fused_moe.py
+++ b/tests/ops/test_fused_moe.py
@@ -100,3 +100,4 @@ def test_fused_experts(
                                 e_map)
        # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
        torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
+    torch.npu.empty_cache()
--- a/tests/test_offline_inference.py
+++ b/tests/test_offline_inference.py
@@ -45,8 +45,6 @@ def test_models(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND"
-
    # 5042 tokens for gemma2
    # gemma2 has alternating sliding window size of 4096
    # we need a prompt with more than 4096 tokens to test the sliding window
@@ -60,3 +58,8 @@ def test_models(
                    enforce_eager=False,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__])