[Test] Remove VLLM_USE_V1 in example and tests (#1733)

V1 is enabled by default, so there is no need to set it by hand anymore. This PR removes the now-redundant setting from the examples and tests.
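For example, a test that previously had to opt into V1 via `monkeypatch` can now drop the env handling entirely. A minimal before/after sketch (illustrative only, not taken verbatim from any one file; the `test_prompts` and `sampling_config` fixture names mirror the test files touched below):

    from vllm import LLM

    # Before: V1 had to be enabled by hand through the environment.
    def test_generate(monkeypatch, test_prompts, sampling_config):
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1")
            llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=True)
            outputs = llm.chat(test_prompts, sampling_config)
            assert len(outputs) == len(test_prompts)

    # After: V1 is the default, so the monkeypatch wrapper disappears.
    def test_generate(test_prompts, sampling_config):
        llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=True)
        outputs = llm.chat(test_prompts, sampling_config)
        assert len(outputs) == len(test_prompts)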

- vLLM version: v0.9.2
- vLLM main: 9ad0a4588b

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-07-15 12:49:57 +08:00 (committed via GitHub)
Commit: 787010a637 (parent: eb921d2b6f)
29 changed files with 186 additions and 291 deletions


@@ -1,15 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import os
 import pytest
 import torch
 from vllm import LLM
-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
 MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
 PROMPT = "Hello my name is Robert and I"


@@ -9,8 +9,8 @@ Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
 """
 import pytest
-from tests.conftest import VllmRunner
-from tests.model_utils import check_outputs_equal
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal

 MODELS = [
     "Qwen/Qwen3-0.6B-Base",


@@ -53,7 +53,6 @@ def model_name():
 @pytest.mark.skipif(
     True, reason="TODO: Enable me after test_mtp_correctness is fixed")
 def test_mtp_correctness(
-        monkeypatch: pytest.MonkeyPatch,
         test_prompts: list[list[dict[str, Any]]],
         sampling_config: SamplingParams,
         model_name: str,
@@ -62,33 +61,30 @@ def test_mtp_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using mtp speculative decoding.
     '''
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
+    ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm

-        spec_llm = LLM(model=model_name,
-                       trust_remote_code=True,
-                       speculative_config={
-                           "method": "deepseek_mtp",
-                           "num_speculative_tokens": 1,
-                       },
-                       max_model_len=256,
-                       enforce_eager=True)
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
+    spec_llm = LLM(model=model_name,
+                   trust_remote_code=True,
+                   speculative_config={
+                       "method": "deepseek_mtp",
+                       "num_speculative_tokens": 1,
+                   },
+                   max_model_len=256,
+                   enforce_eager=True)
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")

-        # Heuristic: expect at least 66% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
-        del spec_llm
+    # Heuristic: expect at least 66% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+    del spec_llm


@@ -60,7 +60,6 @@ def eagle3_model_name():
 def test_ngram_correctness(
-        monkeypatch: pytest.MonkeyPatch,
         test_prompts: list[list[dict[str, Any]]],
         sampling_config: SamplingParams,
         model_name: str,
@@ -70,44 +69,40 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
     pytest.skip("Not current support for the test.")
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
+    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm

-        spec_llm = LLM(
-            model=model_name,
-            speculative_config={
-                "method": "ngram",
-                "prompt_lookup_max": 5,
-                "prompt_lookup_min": 3,
-                "num_speculative_tokens": 3,
-            },
-            max_model_len=1024,
-            enforce_eager=True,
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
+    spec_llm = LLM(
+        model=model_name,
+        speculative_config={
+            "method": "ngram",
+            "prompt_lookup_max": 5,
+            "prompt_lookup_min": 3,
+            "num_speculative_tokens": 3,
+        },
+        max_model_len=1024,
+        enforce_eager=True,
+    )
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")

-        # Heuristic: expect at least 70% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.7 * len(ref_outputs))
-        del spec_llm
+    # Heuristic: expect at least 70% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.7 * len(ref_outputs))
+    del spec_llm

 @pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
 def test_eagle_correctness(
-        monkeypatch: pytest.MonkeyPatch,
         test_prompts: list[list[dict[str, Any]]],
         sampling_config: SamplingParams,
         model_name: str,
@@ -119,43 +114,40 @@ def test_eagle_correctness(
     '''
     if not use_eagle3:
         pytest.skip("Not current support for the test.")
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
-        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-        del ref_llm
+    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
+    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+    del ref_llm

-        spec_model_name = eagle3_model_name(
-        ) if use_eagle3 else eagle_model_name()
-        spec_llm = LLM(
-            model=model_name,
-            trust_remote_code=True,
-            enable_chunked_prefill=True,
-            max_num_seqs=1,
-            max_num_batched_tokens=2048,
-            gpu_memory_utilization=0.6,
-            speculative_config={
-                "method": "eagle3" if use_eagle3 else "eagle",
-                "model": spec_model_name,
-                "num_speculative_tokens": 2,
-                "max_model_len": 128,
-            },
-            max_model_len=128,
-            enforce_eager=True,
-        )
-        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
-        matches = 0
-        misses = 0
-        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
-            if ref_output.outputs[0].text == spec_output.outputs[0].text:
-                matches += 1
-            else:
-                misses += 1
-                print(f"ref_output: {ref_output.outputs[0].text}")
-                print(f"spec_output: {spec_output.outputs[0].text}")
+    spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
+    spec_llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+        max_num_seqs=1,
+        max_num_batched_tokens=2048,
+        gpu_memory_utilization=0.6,
+        speculative_config={
+            "method": "eagle3" if use_eagle3 else "eagle",
+            "model": spec_model_name,
+            "num_speculative_tokens": 2,
+            "max_model_len": 128,
+        },
+        max_model_len=128,
+        enforce_eager=True,
+    )
+    spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")

-        # Heuristic: expect at least 66% of the prompts to match exactly
-        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
-        del spec_llm
+    # Heuristic: expect at least 66% of the prompts to match exactly
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches > int(0.66 * len(ref_outputs))
+    del spec_llm


@@ -20,14 +20,12 @@ Compare the outputs of vLLM with and without aclgraph.
 Run `pytest tests/compile/test_aclgraph.py`.
 """
-import os
 import pytest
 import torch
 from vllm import LLM, SamplingParams

-from tests.conftest import VllmRunner
-from tests.model_utils import check_outputs_equal
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal

 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
@@ -36,37 +34,29 @@ MODELS = [
 ]

-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="aclgraph only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 def test_models(
     model: str,
     max_tokens: int,
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    with monkeypatch.context() as m:
-        prompts = [
-            "Hello, my name is", "The president of the United States is",
-            "The capital of France is", "The future of AI is"
-        ]
+    prompts = [
+        "Hello, my name is", "The president of the United States is",
+        "The capital of France is", "The future of AI is"
+    ]

-        # aclgraph only support on v1
-        m.setenv("VLLM_USE_V1", "1")
-        sampling_params = SamplingParams(max_tokens=max_tokens,
-                                         temperature=0.0)
+    sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)

-        # TODO: change to use vllmrunner when the registry of custom op is solved
-        # while running pytest
-        vllm_model = LLM(model)
-        vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
-        del vllm_model
-        torch.npu.empty_cache()
+    # TODO: change to use vllmrunner when the registry of custom op is solved
+    # while running pytest
+    vllm_model = LLM(model)
+    vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
+    del vllm_model
+    torch.npu.empty_cache()

-        vllm_model = LLM(model, enforce_eager=True)
-        vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
-        del vllm_model
-        torch.npu.empty_cache()
+    vllm_model = LLM(model, enforce_eager=True)
+    vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
+    del vllm_model
+    torch.npu.empty_cache()

     vllm_aclgraph_outputs_list = []
     for output in vllm_aclgraph_outputs:
@@ -86,12 +76,9 @@ def test_models(
 )

-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="aclgraph only support on v1")
 def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_MODELSCOPE", "True")
-        m.setenv("VLLM_USE_V1", "1")
         with pytest.raises(NotImplementedError) as excinfo:
             VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat",
                        max_model_len=1024,


@@ -21,7 +21,7 @@ import torch
 from vllm import LLM, SamplingParams
 from vllm.utils import GiB_bytes

-from tests.utils import fork_new_process_for_each_test
+from tests.e2e.utils import fork_new_process_for_each_test

 from vllm_ascend.device_allocator.camem import CaMemAllocator


@@ -20,8 +20,6 @@ Compare the outputs of vLLM with and without aclgraph.
 Run `pytest tests/compile/test_aclgraph.py`.
 """
-import os
 import pytest
 import torch
 from vllm import LLM, SamplingParams
@@ -29,8 +27,6 @@ from vllm import LLM, SamplingParams
 MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]

-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="new chunked only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [1])
 def test_models(
@@ -39,36 +35,33 @@ def test_models(
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     return
-    with monkeypatch.context() as m:
-        prompts = "The president of the United States is"
-        m.setenv("VLLM_USE_V1", "1")
+    prompts = "The president of the United States is"

-        sampling_params = SamplingParams(
-            max_tokens=max_tokens,
-            temperature=0.0,
-        )
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        temperature=0.0,
+    )

-        vllm_model = LLM(model,
-                         long_prefill_token_threshold=4,
-                         enforce_eager=True)
-        output_chunked = vllm_model.generate(prompts, sampling_params)
-        logprobs_chunked = output_chunked.outputs[0].logprobs
-        del vllm_model
-        torch.npu.empty_cache()
+    vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True)
+    output_chunked = vllm_model.generate(prompts, sampling_params)
+    logprobs_chunked = output_chunked.outputs[0].logprobs
+    del vllm_model
+    torch.npu.empty_cache()

-        vllm_model = LLM(model,
-                         enforce_eager=True,
-                         additional_config={
-                             'ascend_scheduler_config': {
-                                 'enabled': True
-                             },
-                         })
-        output = vllm_model.generate(prompts, sampling_params)
-        logprobs = output.outputs[0].logprobs
-        del vllm_model
-        torch.npu.empty_cache()
+    vllm_model = LLM(model,
+                     enforce_eager=True,
+                     additional_config={
+                         'ascend_scheduler_config': {
+                             'enabled': True
+                         },
+                     })
+    output = vllm_model.generate(prompts, sampling_params)
+    logprobs = output.outputs[0].logprobs
+    del vllm_model
+    torch.npu.empty_cache()

-        logprobs_similarity = torch.cosine_similarity(
-            logprobs_chunked.flatten(), logprobs.flatten(), dim=0)
-        assert logprobs_similarity > 0.95
+    logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(),
+                                                  logprobs.flatten(),
+                                                  dim=0)
+    assert logprobs_similarity > 0.95


@@ -21,8 +21,8 @@ from typing import Optional
 from modelscope import snapshot_download  # type: ignore[import-untyped]

-from tests.conftest import HfRunner
-from tests.utils import check_embeddings_close, matryoshka_fy
+from tests.e2e.conftest import HfRunner
+from tests.e2e.utils import check_embeddings_close, matryoshka_fy

 def run_embedding_correctness_test(


@@ -18,14 +18,14 @@
 #
 import json
 import os
-import re

 import jsonschema
 import pytest
+import regex as re
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -85,11 +85,7 @@ def sample_json_schema():
 def check_backend(guided_decoding_backend: str):
-    if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv(
-            "VLLM_USE_V1") == "0":
-        pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.")
-    if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv(
-            "VLLM_USE_V1") == "1":
+    if guided_decoding_backend not in GuidedDecodingBackendV1:
         pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.")


@@ -3,7 +3,7 @@ import vllm
 from modelscope import snapshot_download  # type: ignore
 from vllm.lora.request import LoRARequest

-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner

 MODEL_PATH = "vllm-ascend/ilama-3.2-1B"


@@ -30,7 +30,7 @@ from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 import vllm_ascend  # noqa: F401

-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner

 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",