[Test] Remove VLLM_USE_V1 in example and tests (#1733)
V1 is enabled by default, so there is no need to set it by hand anymore. This PR removes the now-redundant setting from the examples and tests.
- vLLM version: v0.9.2
- vLLM main: 9ad0a4588b
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
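Every file in the diff follows the same pattern: the `VLLM_USE_V1` skip / `monkeypatch.setenv` plumbing is dropped and the engine is constructed directly. A minimal sketch of the resulting test shape (model name and prompt borrowed from the tests below; not a verbatim copy of any single file):

```python
from vllm import LLM, SamplingParams


def test_basic_generation():
    # No VLLM_USE_V1 handling needed: the V1 engine is the default now.
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct",
              max_model_len=256,
              enforce_eager=True)
    params = SamplingParams(max_tokens=16, temperature=0.0)
    outputs = llm.generate(["Hello my name is Robert and I"], params)
    assert outputs[0].outputs[0].text
```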
@@ -1,15 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import os
-
 import pytest
 import torch
 from vllm import LLM
 
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
 MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
 PROMPT = "Hello my name is Robert and I"
 
@@ -9,8 +9,8 @@ Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
 """
 import pytest
 
-from tests.conftest import VllmRunner
-from tests.model_utils import check_outputs_equal
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
     "Qwen/Qwen3-0.6B-Base",
@@ -53,7 +53,6 @@ def model_name():
 @pytest.mark.skipif(
     True, reason="TODO: Enable me after test_mtp_correctness is fixed")
 def test_mtp_correctness(
-        monkeypatch: pytest.MonkeyPatch,
         test_prompts: list[list[dict[str, Any]]],
         sampling_config: SamplingParams,
         model_name: str,
@@ -62,33 +61,30 @@ def test_mtp_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using mtp speculative decoding.
     '''
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
+    ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm
 
-        spec_llm = LLM(model=model_name,
-                       trust_remote_code=True,
-                       speculative_config={
-                           "method": "deepseek_mtp",
-                           "num_speculative_tokens": 1,
-                       },
-                       max_model_len=256,
-                       enforce_eager=True)
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
+    spec_llm = LLM(model=model_name,
+                   trust_remote_code=True,
+                   speculative_config={
+                       "method": "deepseek_mtp",
+                       "num_speculative_tokens": 1,
+                   },
+                   max_model_len=256,
+                   enforce_eager=True)
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 66% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
-        del spec_llm
+    # Heuristic: expect at least 66% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+    del spec_llm
 
@@ -60,7 +60,6 @@ def eagle3_model_name():
 
 
 def test_ngram_correctness(
-        monkeypatch: pytest.MonkeyPatch,
         test_prompts: list[list[dict[str, Any]]],
         sampling_config: SamplingParams,
         model_name: str,
@@ -70,44 +69,40 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
     pytest.skip("Not current support for the test.")
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
+    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm
 
-        spec_llm = LLM(
-            model=model_name,
-            speculative_config={
-                "method": "ngram",
-                "prompt_lookup_max": 5,
-                "prompt_lookup_min": 3,
-                "num_speculative_tokens": 3,
-            },
-            max_model_len=1024,
-            enforce_eager=True,
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
+    spec_llm = LLM(
+        model=model_name,
+        speculative_config={
+            "method": "ngram",
+            "prompt_lookup_max": 5,
+            "prompt_lookup_min": 3,
+            "num_speculative_tokens": 3,
+        },
+        max_model_len=1024,
+        enforce_eager=True,
+    )
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 70% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.7 * len(ref_outputs))
-        del spec_llm
+    # Heuristic: expect at least 70% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.7 * len(ref_outputs))
+    del spec_llm
 
 
 @pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
 def test_eagle_correctness(
-        monkeypatch: pytest.MonkeyPatch,
         test_prompts: list[list[dict[str, Any]]],
         sampling_config: SamplingParams,
         model_name: str,
@@ -119,43 +114,40 @@ def test_eagle_correctness(
     '''
     if not use_eagle3:
         pytest.skip("Not current support for the test.")
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
 
-        ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
+    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm
 
-        spec_model_name = eagle3_model_name(
-        ) if use_eagle3 else eagle_model_name()
-        spec_llm = LLM(
-            model=model_name,
-            trust_remote_code=True,
-            enable_chunked_prefill=True,
-            max_num_seqs=1,
-            max_num_batched_tokens=2048,
-            gpu_memory_utilization=0.6,
-            speculative_config={
-                "method": "eagle3" if use_eagle3 else "eagle",
-                "model": spec_model_name,
-                "num_speculative_tokens": 2,
-                "max_model_len": 128,
-            },
-            max_model_len=128,
-            enforce_eager=True,
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
+    spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
+    spec_llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+        max_num_seqs=1,
+        max_num_batched_tokens=2048,
+        gpu_memory_utilization=0.6,
+        speculative_config={
+            "method": "eagle3" if use_eagle3 else "eagle",
+            "model": spec_model_name,
+            "num_speculative_tokens": 2,
+            "max_model_len": 128,
+        },
+        max_model_len=128,
+        enforce_eager=True,
+    )
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 66% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
-        del spec_llm
+    # Heuristic: expect at least 66% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+    del spec_llm
 
@@ -20,14 +20,12 @@ Compare the outputs of vLLM with and without aclgraph.
 Run `pytest tests/compile/test_aclgraph.py`.
 """
 
-import os
-
 import pytest
 import torch
 from vllm import LLM, SamplingParams
 
-from tests.conftest import VllmRunner
-from tests.model_utils import check_outputs_equal
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
@@ -36,37 +34,29 @@ MODELS = [
 ]
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="aclgraph only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 def test_models(
     model: str,
     max_tokens: int,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    with monkeypatch.context() as m:
-        prompts = [
-            "Hello, my name is", "The president of the United States is",
-            "The capital of France is", "The future of AI is"
-        ]
+    prompts = [
+        "Hello, my name is", "The president of the United States is",
+        "The capital of France is", "The future of AI is"
+    ]
 
-        # aclgraph only support on v1
-        m.setenv("VLLM_USE_V1", "1")
-        sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
-        # TODO: change to use vllmrunner when the registry of custom op is solved
-        # while running pytest
-        vllm_model = LLM(model)
-        vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
-        del vllm_model
-        torch.npu.empty_cache()
+    sampling_params = SamplingParams(max_tokens=max_tokens,
+                                     temperature=0.0)
+    # TODO: change to use vllmrunner when the registry of custom op is solved
+    # while running pytest
+    vllm_model = LLM(model)
+    vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
+    del vllm_model
+    torch.npu.empty_cache()
 
-        vllm_model = LLM(model, enforce_eager=True)
-        vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
-        del vllm_model
-        torch.npu.empty_cache()
+    vllm_model = LLM(model, enforce_eager=True)
+    vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
+    del vllm_model
+    torch.npu.empty_cache()
 
     vllm_aclgraph_outputs_list = []
     for output in vllm_aclgraph_outputs:
@@ -86,12 +76,9 @@ def test_models(
     )
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="aclgraph only support on v1")
 def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_MODELSCOPE", "True")
-        m.setenv("VLLM_USE_V1", "1")
         with pytest.raises(NotImplementedError) as excinfo:
             VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat",
                        max_model_len=1024,
@@ -21,7 +21,7 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.utils import GiB_bytes
 
-from tests.utils import fork_new_process_for_each_test
+from tests.e2e.utils import fork_new_process_for_each_test
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 
 
@@ -20,8 +20,6 @@ Compare the outputs of vLLM with and without aclgraph.
 Run `pytest tests/compile/test_aclgraph.py`.
 """
 
-import os
-
 import pytest
 import torch
 from vllm import LLM, SamplingParams
@@ -29,8 +27,6 @@ from vllm import LLM, SamplingParams
 MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="new chunked only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [1])
 def test_models(
@@ -39,36 +35,33 @@ def test_models(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     return
-    with monkeypatch.context() as m:
-        prompts = "The president of the United States is"
-
-        m.setenv("VLLM_USE_V1", "1")
+    prompts = "The president of the United States is"
 
-        sampling_params = SamplingParams(
-            max_tokens=max_tokens,
-            temperature=0.0,
-        )
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        temperature=0.0,
+    )
 
-        vllm_model = LLM(model,
-                         long_prefill_token_threshold=4,
-                         enforce_eager=True)
-        output_chunked = vllm_model.generate(prompts, sampling_params)
-        logprobs_chunked = output_chunked.outputs[0].logprobs
-        del vllm_model
-        torch.npu.empty_cache()
+    vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True)
+    output_chunked = vllm_model.generate(prompts, sampling_params)
+    logprobs_chunked = output_chunked.outputs[0].logprobs
+    del vllm_model
+    torch.npu.empty_cache()
 
-        vllm_model = LLM(model,
-                         enforce_eager=True,
-                         additional_config={
-                             'ascend_scheduler_config': {
-                                 'enabled': True
-                             },
-                         })
-        output = vllm_model.generate(prompts, sampling_params)
-        logprobs = output.outputs[0].logprobs
-        del vllm_model
-        torch.npu.empty_cache()
+    vllm_model = LLM(model,
+                     enforce_eager=True,
+                     additional_config={
+                         'ascend_scheduler_config': {
+                             'enabled': True
+                         },
+                     })
+    output = vllm_model.generate(prompts, sampling_params)
+    logprobs = output.outputs[0].logprobs
+    del vllm_model
+    torch.npu.empty_cache()
 
-        logprobs_similarity = torch.cosine_similarity(
-            logprobs_chunked.flatten(), logprobs.flatten(), dim=0)
-        assert logprobs_similarity > 0.95
+    logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(),
+                                                  logprobs.flatten(),
+                                                  dim=0)
+    assert logprobs_similarity > 0.95
 
@@ -21,8 +21,8 @@ from typing import Optional
 
 from modelscope import snapshot_download  # type: ignore[import-untyped]
 
-from tests.conftest import HfRunner
-from tests.utils import check_embeddings_close, matryoshka_fy
+from tests.e2e.conftest import HfRunner
+from tests.e2e.utils import check_embeddings_close, matryoshka_fy
 
 
 def run_embedding_correctness_test(
@@ -18,14 +18,14 @@
 #
 import json
 import os
-import re
 
 import jsonschema
 import pytest
+import regex as re
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -85,11 +85,7 @@ def sample_json_schema():
 
 
 def check_backend(guided_decoding_backend: str):
-    if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv(
-            "VLLM_USE_V1") == "0":
-        pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.")
-    if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv(
-            "VLLM_USE_V1") == "1":
+    if guided_decoding_backend not in GuidedDecodingBackendV1:
         pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.")
 
 
@@ -3,7 +3,7 @@ import vllm
 from modelscope import snapshot_download  # type: ignore
 from vllm.lora.request import LoRARequest
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 MODEL_PATH = "vllm-ascend/ilama-3.2-1B"
 
@@ -30,7 +30,7 @@ from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 
 import vllm_ascend  # noqa: F401
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",