### What this PR does / why we need it?
Reformats the e2e single-card test suite to the repository's current code style: wrapped expressions are collapsed onto single lines, single quotes become double quotes, and continuation indentation is normalized. There are no functional changes. Affected files:
| File Path |
| :--- |
| `tests/e2e/singlecard/compile/backend.py` |
| `tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py` |
| `tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py` |
| `tests/e2e/singlecard/compile/test_norm_quant_fusion.py` |
| `tests/e2e/singlecard/model_runner_v2/test_basic.py` |
| `tests/e2e/singlecard/test_aclgraph_accuracy.py` |
| `tests/e2e/singlecard/test_aclgraph_batch_invariant.py` |
| `tests/e2e/singlecard/test_aclgraph_mem.py` |
| `tests/e2e/singlecard/test_async_scheduling.py` |
| `tests/e2e/singlecard/test_auto_fit_max_mode_len.py` |
| `tests/e2e/singlecard/test_batch_invariant.py` |
| `tests/e2e/singlecard/test_camem.py` |
| `tests/e2e/singlecard/test_completion_with_prompt_embeds.py` |
| `tests/e2e/singlecard/test_cpu_offloading.py` |
| `tests/e2e/singlecard/test_guided_decoding.py` |
| `tests/e2e/singlecard/test_ilama_lora.py` |
| `tests/e2e/singlecard/test_llama32_lora.py` |
| `tests/e2e/singlecard/test_models.py` |
| `tests/e2e/singlecard/test_multistream_overlap_shared_expert.py` |
| `tests/e2e/singlecard/test_quantization.py` |
| `tests/e2e/singlecard/test_qwen3_multi_loras.py` |
| `tests/e2e/singlecard/test_sampler.py` |
| `tests/e2e/singlecard/test_vlm.py` |
| `tests/e2e/singlecard/test_xlite.py` |
| `tests/e2e/singlecard/utils.py` |
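
Because the change is format-only, one quick sanity check (not part of this PR; the two sources below are invented stand-ins) is to confirm that the old and new versions of a file parse to the same AST:

```python
# Hedged sketch: verify a pure reformat by comparing ASTs.
# The two sources are illustrative stand-ins, not code from this PR.
import ast


def same_ast(old_source: str, new_source: str) -> bool:
    """True if the two sources are structurally identical Python."""
    return ast.dump(ast.parse(old_source)) == ast.dump(ast.parse(new_source))


old = 'x = ("a" +\n     "b")\n'  # yapf-style wrapped form
new = 'x = "a" + "b"\n'          # collapsed single-line form
print(same_ast(old, new))  # True: only the layout changed
```

`ast.dump` ignores layout, parentheses, and line breaks, so any difference it reports would indicate a real behavioral edit rather than a reformat.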
### Does this PR introduce _any_ user-facing change?
No. Only test code under `tests/e2e/singlecard/` is reformatted.
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
In `tests/e2e/singlecard/test_async_scheduling.py`:

```diff
@@ -15,8 +15,7 @@ from tests.e2e.model_utils import check_outputs_equal
 MODEL = "Qwen/Qwen3-0.6B"
 MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16"
 
-first_prompt = ("The following numbers of the sequence " +
-                ", ".join(str(i) for i in range(10)) + " are:")
+first_prompt = "The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:"
 example_prompts = [
     "Hello, my name is",
     "The president of the United States is",
```
```diff
@@ -31,7 +30,9 @@ default_params = dict(
 )
 
 
-def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ):
+def test_without_spec_decoding(
+    monkeypatch: pytest.MonkeyPatch,
+):
     """Test consistency of combos of async scheduling, preemption,
     uni/multiproc executor, prefill chunking."""
     test_sampling_params: list[dict[str, Any]] = [
```
```diff
@@ -85,11 +86,11 @@ def run_tests(
     # avoid precision errors
     outputs: list[tuple[str, list, list]] = []
     for n, (
-            test_preemption,
-            executor,
-            async_scheduling,
-            spec_config,
-            test_prefill_chunking,
+        test_preemption,
+        executor,
+        async_scheduling,
+        spec_config,
+        test_prefill_chunking,
     ) in enumerate(test_configs, 1):
         test_str = f"{n}/{len(test_configs)}"
         test_results = run_test(
```
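The reformatted loop keeps the one-name-per-line tuple unpacking but re-indents it, and `enumerate(test_configs, 1)` numbers the configs from 1 for the progress string. A standalone miniature of the pattern, with invented two-entry configs:

```python
# Miniature of the loop above; the config tuples are invented.
test_configs = [
    (False, "mp", False),
    (True, "uni", True),
]
for n, (test_preemption, executor, async_scheduling) in enumerate(test_configs, 1):
    print(f"{n}/{len(test_configs)}: preemption={test_preemption}, "
          f"executor={executor}, async={async_scheduling}")
```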
```diff
@@ -105,21 +106,18 @@ def run_tests(
         outputs.append(test_results)
 
     baseline_config, baseline_tests, _ = outputs[0]
-    _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None),
-                                      (None, None, None))
+    _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), (None, None, None))
 
-    print(
-        f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}"
-    )
+    print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}")
 
     failure = None
     for test_config, test_outputs, test_acceptance_rates in outputs[1:]:
         for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip(
-                baseline_tests,
-                baseline_acceptances or repeat(None),
-                test_outputs,
-                test_acceptance_rates or repeat(None),
-                test_sampling_params,
+            baseline_tests,
+            baseline_acceptances or repeat(None),
+            test_outputs,
+            test_acceptance_rates or repeat(None),
+            test_sampling_params,
         ):
             try:
                 check_outputs_equal(
```
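The collapsed `next(...)` call picks the first output triple that actually recorded acceptance rates, falling back to `(None, None, None)` when no spec-decoding config ran, so the tuple unpacking still succeeds. A toy demonstration with made-up data:

```python
# next() with a default, as used above; the data is invented.
outputs = [
    ("config-a", ["out"], None),        # no acceptance rates recorded
    ("config-b", ["out"], [0.7, 0.8]),  # first entry with rates
]
_, _, baseline_acceptances = next(
    (o for o in outputs if o[2] is not None), (None, None, None)
)
print(baseline_acceptances)  # [0.7, 0.8]

# With no matching entry, the default keeps the unpacking from failing:
_, _, none_rates = next((o for o in [] if o[2] is not None), (None, None, None))
print(none_rates)  # None
```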
```diff
@@ -129,21 +127,18 @@ def run_tests(
                     name_1=f"config=[{test_config}], params={params}",
                 )
 
-                if (base_acceptance_rate is not None
-                        and test_acceptance_rate is not None):
+                if base_acceptance_rate is not None and test_acceptance_rate is not None:
                     if "spec_mml=None" in test_config:
-                        assert (test_acceptance_rate > base_acceptance_rate
-                                or test_acceptance_rate == pytest.approx(
-                                    base_acceptance_rate, rel=5e-2))
+                        assert test_acceptance_rate > base_acceptance_rate or test_acceptance_rate == pytest.approx(
+                            base_acceptance_rate, rel=5e-2
+                        )
                     else:
                         # Currently the reported acceptance rate is expected to be
                         # lower when we sometimes skip drafting altogether.
                         assert test_acceptance_rate > 0.1
-                print(f"PASSED: config=[{test_config}], params={params}"
-                      f" accept_rate={test_acceptance_rate}")
+                print(f"PASSED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
             except AssertionError as e:
-                print(f"FAILED: config=[{test_config}], params={params}"
-                      f" accept_rate={test_acceptance_rate}")
+                print(f"FAILED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
                 if failure is None:
                     failure = e
 
```
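The reflowed assertion passes when the new acceptance rate either beats the baseline or lands within 5% relative tolerance of it; `pytest.approx(x, rel=5e-2)` accepts values within `0.05 * |x|`:

```python
# How the rel=5e-2 tolerance behaves, standalone; the numbers are invented.
import pytest

base = 0.80
assert 0.78 == pytest.approx(base, rel=5e-2)        # off by 0.02 <= 0.04 -> passes
assert not (0.75 == pytest.approx(base, rel=5e-2))  # off by 0.05 >  0.04 -> fails
print("tolerance behaves as expected")
```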
```diff
@@ -161,33 +156,35 @@ def run_test(
     spec_config: dict[str, Any] | None,
     test_prefill_chunking: bool,
 ):
-    os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
     spec_decoding = spec_config is not None
     cache_arg: dict[str, Any] = (
         # Force preemptions
-        dict(num_gpu_blocks_override=2) if test_preemption else dict(
-            gpu_memory_utilization=0.9))
+        dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9)
+    )
     spec_mml = (spec_config or {}).get("max_model_len")
-    test_config = (f"executor={executor}, preemption={test_preemption}, "
-                   f"async_sched={async_scheduling}, "
-                   f"chunk_prefill={test_prefill_chunking}, "
-                   f"spec_decoding={spec_decoding}, spec_mml={spec_mml}")
+    test_config = (
+        f"executor={executor}, preemption={test_preemption}, "
+        f"async_sched={async_scheduling}, "
+        f"chunk_prefill={test_prefill_chunking}, "
+        f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
+    )
     print("-" * 80)
     print(f"---- TESTING {test_str}: {test_config}")
     print("-" * 80)
     with VllmRunner(
-            model,
-            max_model_len=512,
-            enable_chunked_prefill=test_prefill_chunking,
-            # Force prefill chunking
-            max_num_batched_tokens=48 if test_prefill_chunking else None,
-            enforce_eager=True,
-            async_scheduling=async_scheduling,
-            distributed_executor_backend=executor,
-            dtype="float16",  # avoid precision errors
-            speculative_config=spec_config,
-            disable_log_stats=False,
-            **cache_arg,
+        model,
+        max_model_len=512,
+        enable_chunked_prefill=test_prefill_chunking,
+        # Force prefill chunking
+        max_num_batched_tokens=48 if test_prefill_chunking else None,
+        enforce_eager=True,
+        async_scheduling=async_scheduling,
+        distributed_executor_backend=executor,
+        dtype="float16",  # avoid precision errors
+        speculative_config=spec_config,
+        disable_log_stats=False,
+        **cache_arg,
     ) as vllm_model:
         results = []
         acceptance_rates: list[float] | None = [] if spec_decoding else None
```
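`cache_arg` is a single-key dict selected by the preemption flag and splatted into the runner via `**cache_arg`, so exactly one of the two mutually exclusive cache settings is passed. A runnable toy version of the pattern (`runner` is a stand-in for `VllmRunner`):

```python
# Toy version of the **cache_arg pattern; runner stands in for VllmRunner.
from typing import Any


def runner(**kwargs: Any) -> dict[str, Any]:
    """Stand-in that just echoes the kwargs it received."""
    return kwargs


for test_preemption in (True, False):
    cache_arg: dict[str, Any] = (
        # A tiny KV cache forces preemptions; otherwise use normal utilization
        dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9)
    )
    print(runner(max_model_len=512, **cache_arg))
```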
```diff
@@ -197,26 +194,23 @@ def run_test(
             results.append(
                 vllm_model.generate(
                     example_prompts,
-                    sampling_params=SamplingParams(**default_params,
-                                                   **override_params),
-                ))
+                    sampling_params=SamplingParams(**default_params, **override_params),
+                )
+            )
             metrics_after = vllm_model.model.get_metrics()
             if acceptance_rates is not None:
-                acceptance_rate = _get_acceptance_rate(metrics_before,
-                                                       metrics_after)
+                acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after)
                 acceptance_rates.append(acceptance_rate)
                 print(f"ACCEPTANCE RATE {acceptance_rate}")
 
         if test_preemption:
-            preemptions = _get_count(metrics_before, metrics_after,
-                                     "vllm:num_preemptions")
+            preemptions = _get_count(metrics_before, metrics_after, "vllm:num_preemptions")
             assert preemptions > 0, "preemption test had no preemptions"
 
         if len(results) > 1:
             # First check that the different parameter configs
             # actually result in different output.
-            for other_test_outs, params in zip(results[1:],
-                                               sampling_param_tests[1:]):
+            for other_test_outs, params in zip(results[1:], sampling_param_tests[1:]):
                 with pytest.raises(AssertionError):
                     check_outputs_equal(
                         outputs_0_lst=results[0][0],
```
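The final sanity check inverts `check_outputs_equal`: wrapping it in `pytest.raises(AssertionError)` asserts that different sampling parameters really do produce different outputs, so the later equality checks are meaningful. The inversion pattern in isolation (`check_equal` stands in for `check_outputs_equal`):

```python
# pytest.raises(AssertionError) passes only if the wrapped check fails.
import pytest


def check_equal(a, b):
    """Stand-in for check_outputs_equal: a plain equality assertion."""
    assert a == b, f"{a!r} != {b!r}"


with pytest.raises(AssertionError):
    check_equal(["greedy output"], ["sampled output"])  # differ -> raises -> passes
print("outputs confirmed different")
```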