ACLgraph enable: Test cases revisions for all features (#3388)
### What this PR does / why we need it?
This PR revises the test cases for various features in the repository, enabling ACL graph in those test cases.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
UT

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
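The pattern is the same in every touched test: either flip `enforce_eager=True` to `enforce_eager=False` so the graph (ACL graph) execution path is exercised, or parametrize the test so it runs in both eager and graph modes. A minimal sketch of the parametrized form is below; the import path, test name, and exact arguments are illustrative assumptions, not code from this PR:

```python
import pytest

# NOTE: this import path is an assumption about where the repository's
# VllmRunner test helper lives; adjust it to the actual location.
from tests.e2e.conftest import VllmRunner

MODEL = "Qwen/Qwen3-0.6B"


# Hypothetical test (not part of this PR): run once in eager mode and once
# with enforce_eager=False so the ACL graph path is captured and exercised.
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_generate_in_both_modes(enforce_eager):
    with VllmRunner(MODEL,
                    max_model_len=2048,
                    enforce_eager=enforce_eager,
                    gpu_memory_utilization=0.7) as vllm_model:
        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] * 3)
        # One completion per prompt is expected in both execution modes.
        assert len(outputs) == 3
        for output in outputs:
            assert len(output.outputs) == 1
```

The hunks below apply this change across the speculative decoding (ngram and eagle), scheduler, sleep-mode, chunked-prefill, embedding, LoRA, quantization, and multimodal tests.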
@@ -71,7 +71,7 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
     pytest.skip("Not current support for the test.")
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
+    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm
     with VllmRunner(model_name,
@@ -82,7 +82,7 @@ def test_ngram_correctness(
                         "num_speculative_tokens": 3,
                     },
                     max_model_len=1024,
-                    enforce_eager=True) as runner:
+                    enforce_eager=False) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
         matches = 0
         misses = 0
@@ -111,7 +111,7 @@ def test_eagle_correctness(
     should be the same when using eagle speculative decoding.
     '''

-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
+    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm

@@ -129,7 +129,7 @@ def test_eagle_correctness(
             "max_model_len": 128,
         },
         max_model_len=128,
-        enforce_eager=True,
+        enforce_eager=False,
     ) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)

@@ -9,7 +9,8 @@ from tests.e2e.model_utils import check_outputs_equal
 MODEL = "Qwen/Qwen3-0.6B"


-def test_concurrent_partial_prefill():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_concurrent_partial_prefill(enforce_eager):
     with VllmRunner(MODEL,
                     additional_config={
                         'ascend_scheduler_config': {
@@ -18,7 +19,7 @@ def test_concurrent_partial_prefill():
                     },
                     max_num_seqs=3,
                     max_num_batched_tokens=2048,
-                    enforce_eager=True,
+                    enforce_eager=enforce_eager,
                     max_model_len=2048,
                     gpu_memory_utilization=0.7) as vllm_model:
         outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
@@ -28,7 +29,8 @@ def test_concurrent_partial_prefill():
         assert len(output.outputs) == 1


-def test_prefix_cache_stats_is_recorded():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_prefix_cache_stats_is_recorded(enforce_eager):
     with VllmRunner(MODEL,
                     additional_config={
                         'ascend_scheduler_config': {
@@ -37,7 +39,7 @@ def test_prefix_cache_stats_is_recorded():
                     },
                     max_num_seqs=3,
                     max_num_batched_tokens=2048,
-                    enforce_eager=True,
+                    enforce_eager=enforce_eager,
                     max_model_len=2048,
                     gpu_memory_utilization=0.7) as vllm_model:
         # 17 tokens will make sure first 16 tokens are cached in a block
@@ -74,7 +74,7 @@ def test_end_to_end():
     sampling_params = SamplingParams(temperature=0, max_tokens=10)

     with VllmRunner("Qwen/Qwen3-0.6B",
-                    enforce_eager=True,
+                    enforce_eager=False,
                     enable_sleep_mode=True) as runner:

         output = runner.model.generate(prompt, sampling_params)
@@ -43,12 +43,13 @@ def test_models(
         temperature=0.0,
     )

-    with VllmRunner(model, long_prefill_token_threshold=20,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(model,
+                    long_prefill_token_threshold=20,
+                    enforce_eager=False) as vllm_model:
         output1 = vllm_model.generate(prompts, sampling_params)

     with VllmRunner(model,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     additional_config={
                         'ascend_scheduler_config': {
                             'enabled': True
@@ -29,7 +29,7 @@ def test_embed_models_correctness():
     with VllmRunner(
             model_name,
             task="embed",
-            enforce_eager=True,
+            enforce_eager=False,
     ) as vllm_runner:
         vllm_outputs = vllm_runner.encode(queries)

@@ -51,7 +51,7 @@ def test_ilama_lora(ilama_lora_files):
                     max_loras=4,
                     max_model_len=1024,
                     max_num_seqs=16,
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:

         output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
         for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -28,7 +28,7 @@ def test_quant_W8A8():
     with VllmRunner(
             snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
             max_model_len=8192,
-            enforce_eager=True,
+            enforce_eager=False,
             gpu_memory_utilization=0.7,
             quantization="ascend",
     ) as vllm_model:
@@ -46,7 +46,7 @@ def test_multimodal_vl(prompt_template):
                         "max_pixels": 1280 * 28 * 28,
                         "fps": 1,
                     },
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         outputs = vllm_model.generate_greedy(prompts=prompts,
                                              images=images,
                                              max_tokens=64)