[Lint] Style: Convert test/ to ruff format (Batch #5) (#6747)

### What this PR does / why we need it?

Converts the following `tests/e2e/singlecard/` files to ruff format as part of batch #5 of the lint migration (see the illustrative before/after sketch after the file list):

| File Path |
| :--- |
| `tests/e2e/singlecard/compile/backend.py` |
| `tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py` |
| `tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py` |
| `tests/e2e/singlecard/compile/test_norm_quant_fusion.py` |
| `tests/e2e/singlecard/model_runner_v2/test_basic.py` |
| `tests/e2e/singlecard/test_aclgraph_accuracy.py` |
| `tests/e2e/singlecard/test_aclgraph_batch_invariant.py` |
| `tests/e2e/singlecard/test_aclgraph_mem.py` |
| `tests/e2e/singlecard/test_async_scheduling.py` |
| `tests/e2e/singlecard/test_auto_fit_max_mode_len.py` |
| `tests/e2e/singlecard/test_batch_invariant.py` |
| `tests/e2e/singlecard/test_camem.py` |
| `tests/e2e/singlecard/test_completion_with_prompt_embeds.py` |
| `tests/e2e/singlecard/test_cpu_offloading.py` |
| `tests/e2e/singlecard/test_guided_decoding.py` |
| `tests/e2e/singlecard/test_ilama_lora.py` |
| `tests/e2e/singlecard/test_llama32_lora.py` |
| `tests/e2e/singlecard/test_models.py` |
| `tests/e2e/singlecard/test_multistream_overlap_shared_expert.py` |
| `tests/e2e/singlecard/test_quantization.py` |
| `tests/e2e/singlecard/test_qwen3_multi_loras.py` |
| `tests/e2e/singlecard/test_sampler.py` |
| `tests/e2e/singlecard/test_vlm.py` |
| `tests/e2e/singlecard/test_xlite.py` |
| `tests/e2e/singlecard/utils.py` |
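
The diffs below follow a consistent pattern: `typing.List`/`typing.Optional` annotations are replaced with built-in generics and `X | None`, single-quoted strings become double-quoted, and hanging-indent call sites are reflowed to ruff's line-length defaults. A minimal, self-contained sketch of that before/after style (the `find_nodes` helper is a simplified stand-in for illustration, not code copied from this PR):

```python
# Illustrative sketch only: shows the kind of rewrite applied across these
# test files (typing modernization plus ruff formatting). `find_nodes` and
# its body are hypothetical stand-ins, not code from this PR.

# Before (pre-ruff style):
#
#     from typing import List, Optional
#
#     def find_nodes(nodes, target: Optional[str]) -> List['Node']:
#         return [
#             node for node in nodes
#             if hasattr(node, 'target') and node.target == target
#         ]

# After (ruff format: double quotes, PEP 585/604 annotations, reflowed body):
from collections.abc import Sequence


def find_nodes(nodes: Sequence[object], target: str | None) -> list[object]:
    """Return all items whose `target` attribute equals `target`."""
    return [node for node in nodes if getattr(node, "target", None) == target]


if __name__ == "__main__":
    class Node:
        def __init__(self, target: str) -> None:
            self.target = target

    print(len(find_nodes([Node("a"), Node("b"), Node("a")], "a")))  # 2
```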

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Author: SILONG ZENG
Date: 2026-02-24 15:50:00 +08:00
Committed by: GitHub
Parent: 747484cb64
Commit: 62ea664aa7
26 changed files with 859 additions and 1052 deletions

View File

@@ -14,8 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections.abc import Callable, Sequence
from copy import deepcopy
from typing import Any, Callable, List, Optional, Sequence
from typing import Any
import torch.fx as fx
from torch._inductor.decomposition import select_decomp_table
@@ -37,7 +38,7 @@ class TestBackend:
records the FX graph before and after the transformation.
"""
def __init__(self, custom_passes: Optional[List[Any]] = None):
def __init__(self, custom_passes: list[Any] | None = None):
vllm_config = get_current_vllm_config()
compile_config = vllm_config.compilation_config
self.inductor_config = compile_config.inductor_compile_config
@@ -48,9 +49,7 @@ class TestBackend:
self.graph_pre_pass = None
self.graph_post_pass = None
def post_pass(self,
graph: fx.Graph,
runtime_shape: int | None = None) -> fx.Graph:
def post_pass(self, graph: fx.Graph, runtime_shape: int | None = None) -> fx.Graph:
"""
Apply custom graph transformation passes.
"""
@@ -62,13 +61,13 @@ class TestBackend:
return graph
def compile(
self,
graph: fx.GraphModule,
example_inputs: list[Any],
compiler_config: dict[str, Any],
runtime_shape: Optional[int] = None,
key: Optional[str] = None
) -> tuple[Optional[Callable], Optional[Any]]:
self,
graph: fx.GraphModule,
example_inputs: list[Any],
compiler_config: dict[str, Any],
runtime_shape: int | None = None,
key: str | None = None,
) -> tuple[Callable | None, Any | None]:
"""
Compile the FX graph using vLLM's Ascend compiler interface.
Wraps the post-pass logic into the inner_compile callback.
@@ -87,8 +86,7 @@ class TestBackend:
)
return compiled_fn, None
def __call__(self, gm: fx.GraphModule,
example_inputs: Optional[List[Any]]):
def __call__(self, gm: fx.GraphModule, example_inputs: list[Any] | None):
"""
Make the backend callable by torch.compile().
Returns a compiled executable function.
@@ -103,17 +101,11 @@ class TestBackend:
)
return compiled_fn
def find_nodes_by_target(self, graph: fx.GraphModule,
target: OpOverload) -> List[fx.Node]:
def find_nodes_by_target(self, graph: fx.GraphModule, target: OpOverload) -> list[fx.Node]:
"""Helper to find all FX nodes that call a specific operator."""
return [
node for node in graph.graph.nodes
if hasattr(node, 'target') and node.target == target
]
return [node for node in graph.graph.nodes if hasattr(node, "target") and node.target == target]
def check_before_ops(self,
ops: Sequence[OpOverload],
fully_replaced: bool = True):
def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced: bool = True):
"""
Verify that the original (unfused) operators exist before the pass
and are fully removed afterward (if fully_replaced=True).

View File

@@ -215,6 +215,7 @@ def register_pattern_safe(pattern_class, vllm_config, eps, pattern_key):
try:
# Import the required pass class
from torch._inductor.pattern_matcher import PatternMatcherPass
pm_pass = PatternMatcherPass()
pattern.register(pm_pass)
_registered_patterns.add(pattern_key)
@@ -243,7 +244,7 @@ def test_rmsnorm_quant_fusion(
sp_enable: bool,
):
# Check if fusion operator is available
if not hasattr(torch.ops.npu, 'npu_add_rms_norm_quant'):
if not hasattr(torch.ops.npu, "npu_add_rms_norm_quant"):
pytest.skip("Fusion operator npu_add_rms_norm_quant not available, skipping test")
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
@@ -266,7 +267,7 @@ def test_rmsnorm_quant_fusion(
if not enable_custom_op():
pytest.skip("Custom ops not available, skipping bias test")
# Check if the bias operator exists
if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'):
if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"):
pytest.skip("Operator npu_add_rms_norm_bias not available, skipping bias test")
if sp_enable:
model = ModelSPWithBias(hidden_size, dtype, eps, device="npu")
@@ -281,13 +282,11 @@ def test_rmsnorm_quant_fusion(
else:
# The non-bias patterns currently use npu_add_rms_norm_bias in their pattern matching
# so we need to skip if it's not available
if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'):
if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"):
pytest.skip("Operator npu_add_rms_norm_bias not available, skipping test")
if sp_enable:
model = ModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
register_pattern_safe(
AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern"
)
register_pattern_safe(AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern")
else:
model = ModelWithoutBias(hidden_size, dtype, eps, device="npu")
register_pattern_safe(AddRMSNormQuantPattern, vllm_config, eps, "GraphEXAddRMSNormQuantPattern")
@@ -302,5 +301,9 @@ def test_rmsnorm_quant_fusion(
compiled_out, compiled_res = compiled_model(x)
# Verify output shapes are correct
assert compiled_out.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}"
assert compiled_res.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}"
assert compiled_out.shape == (num_tokens, hidden_size), (
f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}"
)
assert compiled_res.shape == (num_tokens, hidden_size), (
f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}"
)

View File

@@ -201,6 +201,7 @@ def test_rmsnorm_quant_fusion(
vllm_config=vllm_config, head_dim=head_dim, num_heads=num_heads, num_kv_heads=num_kv_heads, eps=eps
)
from torch._inductor.pattern_matcher import PatternMatcherPass
pm_pass = PatternMatcherPass()
fusion_pattern.register(pm_pass)
model = model.to("npu")

View File

@@ -14,25 +14,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import pytest
import torch
import torch.nn as nn
import torch_npu
import vllm.config
from vllm.config import ModelConfig, VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
from vllm.utils.system_utils import update_environment_variables
import vllm_ascend.ops.register_custom_ops # noqa
from tests.e2e.singlecard.compile.backend import TestBackend
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import \
AddRMSNormQuantFusionPass
from vllm_ascend.utils import enable_custom_op
from vllm_ascend.utils import vllm_version_is
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass
from vllm_ascend.utils import enable_custom_op, vllm_version_is
if vllm_version_is("0.15.0"):
from vllm.compilation.fx_utils import OpOverload # type: ignore
@@ -48,34 +43,24 @@ def get_or_create_backend(vllm_config):
"""Get or create backend with fusion passes (cached to avoid duplicate pattern registration)."""
global _backend_cache
if _backend_cache is None:
_backend_cache = TestBackend(custom_passes=[
AddRMSNormQuantFusionPass(vllm_config=vllm_config)
])
_backend_cache = TestBackend(custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)])
return _backend_cache
class TestModelWithoutBias(nn.Module):
"""
A minimal test model that simulates the pattern:
AddRMSNorm → Quantization (without bias)
"""
def __init__(self,
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
super().__init__()
self.hidden_size = hidden_size
self.eps = eps
self.rms_norm_weight = nn.Parameter(
torch.randn(hidden_size, device=device))
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size,
dtype=dtype,
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
def forward(self, x):
"""
@@ -87,23 +72,20 @@ class TestModelWithoutBias(nn.Module):
residual = torch.zeros_like(x)
norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, None, self.eps)
x, residual, self.rms_norm_weight, None, self.eps
)
quantized_output = torch.ops.vllm.quantize(norm_output,
self.quant_scale,
self.quant_scale_reciprocal,
self.quant_offset)
quantized_output = torch.ops.vllm.quantize(
norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
)
return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]:
def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion."""
return [
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.quantize.default
]
return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]
def ops_in_model_after(self) -> List[OpOverload]:
def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion."""
return [torch.ops.npu.npu_add_rms_norm_quant.default]
@@ -114,24 +96,15 @@ class TestModelWithBias(nn.Module):
AddRMSNorm → Add Bias → Quantization (with bias)
"""
def __init__(self,
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
super().__init__()
self.hidden_size = hidden_size
self.eps = eps
self.rms_norm_weight = nn.Parameter(
torch.randn(hidden_size, device=device))
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size,
dtype=dtype,
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
def forward(self, x):
"""
@@ -144,23 +117,20 @@ class TestModelWithBias(nn.Module):
residual = torch.zeros_like(x)
norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, self.bias, self.eps)
x, residual, self.rms_norm_weight, self.bias, self.eps
)
quantized_output = torch.ops.vllm.quantize(norm_output_with_bias,
self.quant_scale,
self.quant_scale_reciprocal,
self.quant_offset)
quantized_output = torch.ops.vllm.quantize(
norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
)
return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]:
def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion."""
return [
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.quantize.default
]
return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]
def ops_in_model_after(self) -> List[OpOverload]:
def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion."""
return [torch.ops.npu.npu_add_rms_norm_quant.default]
@@ -171,23 +141,14 @@ class TestModelSPWithoutBias(nn.Module):
AddRMSNorm → maybe_allgather → Quantization (without bias)
"""
def __init__(self,
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
super().__init__()
self.hidden_size = hidden_size
self.eps = eps
self.rms_norm_weight = nn.Parameter(
torch.randn(hidden_size, device=device))
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size,
dtype=dtype,
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
def forward(self, x):
"""
@@ -200,32 +161,28 @@ class TestModelSPWithoutBias(nn.Module):
residual = torch.zeros_like(x)
norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, None, self.eps)
x, residual, self.rms_norm_weight, None, self.eps
)
norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
norm_output, True)
norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output, True)
quantized_output = torch.ops.vllm.quantize(norm_output,
self.quant_scale,
self.quant_scale_reciprocal,
self.quant_offset)
quantized_output = torch.ops.vllm.quantize(
norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
)
return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]:
def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion."""
return [
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
torch.ops.vllm.quantize.default
torch.ops.vllm.quantize.default,
]
def ops_in_model_after(self) -> List[OpOverload]:
def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion."""
return [
torch.ops.npu.npu_add_rms_norm_quant.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default
]
return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
class TestModelSPWithBias(nn.Module):
@@ -234,24 +191,15 @@ class TestModelSPWithBias(nn.Module):
AddRMSNorm → Add bias → maybe_allgather → Quantization (without bias)
"""
def __init__(self,
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
super().__init__()
self.hidden_size = hidden_size
self.eps = eps
self.rms_norm_weight = nn.Parameter(
torch.randn(hidden_size, device=device))
self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size,
dtype=dtype,
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
def forward(self, x):
"""
@@ -265,32 +213,28 @@ class TestModelSPWithBias(nn.Module):
residual = torch.zeros_like(x)
norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, self.bias, self.eps)
x, residual, self.rms_norm_weight, self.bias, self.eps
)
norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
norm_output_with_bias, True)
norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output_with_bias, True)
quantized_output = torch.ops.vllm.quantize(norm_output_with_bias,
self.quant_scale,
self.quant_scale_reciprocal,
self.quant_offset)
quantized_output = torch.ops.vllm.quantize(
norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
)
return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]:
def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion."""
return [
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
torch.ops.vllm.quantize.default
torch.ops.vllm.quantize.default,
]
def ops_in_model_after(self) -> List[OpOverload]:
def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion."""
return [
torch.ops.npu.npu_add_rms_norm_quant.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default
]
return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@@ -317,58 +261,42 @@ def test_rmsnorm_quant_fusion(
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
with vllm.config.set_current_vllm_config(vllm_config):
update_environment_variables({
"RANK": "0",
"LOCAL_RANK": "0",
"WORLD_SIZE": "1",
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
})
update_environment_variables(
{
"RANK": "0",
"LOCAL_RANK": "0",
"WORLD_SIZE": "1",
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
}
)
init_distributed_environment()
ensure_model_parallel_initialized(1, 1)
with vllm.config.set_current_vllm_config(vllm_config):
with set_ascend_forward_context(None, vllm_config):
backend = get_or_create_backend(vllm_config)
if use_bias:
if not enable_custom_op():
return
if sp_enable:
model = TestModelSPWithBias(hidden_size,
dtype,
eps,
device="npu")
else:
model = TestModelWithBias(hidden_size,
dtype,
eps,
device="npu")
with vllm.config.set_current_vllm_config(vllm_config), set_ascend_forward_context(None, vllm_config):
backend = get_or_create_backend(vllm_config)
if use_bias:
if not enable_custom_op():
return
if sp_enable:
model = TestModelSPWithBias(hidden_size, dtype, eps, device="npu")
else:
if sp_enable:
model = TestModelSPWithoutBias(hidden_size,
dtype,
eps,
device="npu")
else:
model = TestModelWithoutBias(hidden_size,
dtype,
eps,
device="npu")
model = model.to("npu")
model = TestModelWithBias(hidden_size, dtype, eps, device="npu")
else:
if sp_enable:
model = TestModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
else:
model = TestModelWithoutBias(hidden_size, dtype, eps, device="npu")
model = model.to("npu")
x = torch.rand(num_tokens,
hidden_size,
device="npu",
dtype=dtype,
requires_grad=False)
x = torch.rand(num_tokens, hidden_size, device="npu", dtype=dtype, requires_grad=False)
result_unfused = model(x)
print("Unfused result:", [t.shape for t in result_unfused])
model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x)
print("Fused result:", [t.shape for t in result_fused])
result_unfused = model(x)
print("Unfused result:", [t.shape for t in result_unfused])
model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x)
print("Fused result:", [t.shape for t in result_fused])
print("=== Checking operator fusion ===")
backend.check_before_ops(model.ops_in_model_before(),
fully_replaced=not sp_enable)
backend.check_after_ops(model.ops_in_model_after())
print("=== Checking operator fusion ===")
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=not sp_enable)
backend.check_after_ops(model.ops_in_model_after())

View File

@@ -47,9 +47,9 @@ def test_qwen3_dense_eager_mode(
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=enforce_eager,
model,
max_model_len=1024,
enforce_eager=enforce_eager,
) as runner:
runner.model.generate(prompts, sampling_params)
@@ -74,14 +74,14 @@ def test_egale_spec_decoding(
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=enforce_eager,
async_scheduling=True,
speculative_config={
"model": eagle_model,
"method": "eagle",
"num_speculative_tokens": 3,
},
model,
max_model_len=1024,
enforce_eager=enforce_eager,
async_scheduling=True,
speculative_config={
"model": eagle_model,
"method": "eagle",
"num_speculative_tokens": 3,
},
) as runner:
runner.model.generate(prompts, sampling_params)

View File

@@ -15,20 +15,22 @@
# limitations under the License.
#
import pytest
# ruff: noqa: E501
import os
from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT,
LLMTestCase, gen_and_valid)
import pytest
from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid
CASE_QWEN_ACLGRAPH = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
)
@@ -37,10 +39,10 @@ CASE_DS_ACLGRAPH = LLMTestCase(
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
' here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of'
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
" here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of",
],
)
@@ -49,9 +51,9 @@ CASE_QWEN_FULL = LLMTestCase(
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
)
@@ -60,10 +62,10 @@ CASE_DS_FULL = LLMTestCase(
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
' here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of'
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
" here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of",
],
)
@@ -71,10 +73,11 @@ CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
golden_answers=[
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
])
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
],
)
CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
@@ -83,26 +86,31 @@ CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
])
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
],
)
CASE_QWEN_EX = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
golden_answers=[
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
])
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
],
)
CASE_DS_EX = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
],
)
CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
])
@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase):
@@ -112,51 +120,48 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
"cudagraph_capture_sizes": [1, 2, 4, 8],
"quantization": cur_case.quantization,
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize(
"cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
runner_kwargs = {
"model_name": cur_case.model,
"max_model_len": 1024,
"compilation_config": {
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"quantization": cur_case.quantization,
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize(
"cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
runner_kwargs = {
"model_name": cur_case.model,
"max_model_len": 1024,
"compilation_config": {
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"quantization": cur_case.quantization,
"additional_config": {
"npugraph_ex_config": {
"enable": False
}
},
"additional_config": {"npugraph_ex_config": {"enable": False}},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
@@ -165,20 +170,16 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
"model_name": cur_case.model,
"quantization": cur_case.quantization,
"max_model_len": 1024,
"compilation_config": {
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"additional_config": {
"npugraph_ex_config": {
"enable": True
}
},
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"additional_config": {"npugraph_ex_config": {"enable": True}},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
# The accuracy has already been verified in the previous test case.
# This test case is used to check whether the functionality works properly
@@ -190,10 +191,7 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
"model_name": cur_case.model,
"quantization": cur_case.quantization,
"max_model_len": 1024,
"compilation_config": {
"cudagraph_capture_sizes": [4, 8],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"compilation_config": {"cudagraph_capture_sizes": [4, 8], "cudagraph_mode": "FULL_DECODE_ONLY"},
"additional_config": {
"npugraph_ex_config": {
"enable": True,
@@ -201,12 +199,14 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
}
},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
# Check whether the static kernel is properly uninstall
ascend_home_path = os.environ["ASCEND_HOME_PATH"]
static_kernel_install_path = os.path.join(ascend_home_path, 'opp/static_kernel/ai_core')
static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core")
assert not os.path.exists(static_kernel_install_path)

View File

@@ -22,6 +22,7 @@ import random
import pytest
import torch
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
@@ -69,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
if target_words > 50:
# For longer prompts, repeat context
padding_text = (
" This is an interesting topic that deserves more explanation. " *
(target_words // 50))
padding_text = " This is an interesting topic that deserves more explanation. " * (target_words // 50)
base_prompt = base_prompt + padding_text
return base_prompt
@@ -107,8 +106,7 @@ def _extract_step_logprobs(generate_output):
@pytest.mark.timeout(1000)
def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
monkeypatch: pytest.MonkeyPatch):
def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch):
"""
Ensures that the same request (the 'needle' prompt) yields identical output
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
@@ -162,20 +160,16 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
needle_prompt = "There once was a "
with VllmRunner(
model_name=model,
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
enable_prefix_caching=False,
distributed_executor_backend="mp",
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
}
model_name=model,
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
enable_prefix_caching=False,
distributed_executor_backend="mp",
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
) as vllm_model:
# Baseline generation for the needle prompt alone.
baseline_out = vllm_model.generate([needle_prompt], sampling)
assert len(baseline_out) == 1
@@ -194,8 +188,7 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
if i == needle_pos:
prompts.append(needle_prompt)
else:
prompts.append(
_random_prompt(min_random_prompt, max_random_prompt))
prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
# Generate with the larger-batch engine
outputs = vllm_model.generate(prompts, sampling)
@@ -204,24 +197,23 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
text = needle_output[0]
if text != baseline_text:
print(
f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
mismatches += 1
passes = num_trials - mismatches
# Dump how many passed vs failed
print(f"[determinism] total={num_trials}, passed={passes}, "
f"failed={mismatches}, max_batch_size={max_batch_size}")
print(
f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}"
)
if mismatches > 0:
pytest.fail(
f"Nondeterministic outputs detected: {mismatches} failed out "
f"of {num_trials} trials (max_batch_size={max_batch_size}).")
f"of {num_trials} trials (max_batch_size={max_batch_size})."
)
def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
monkeypatch: pytest.MonkeyPatch):
def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch):
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = DEFAULT_MODEL
@@ -235,24 +227,19 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
if disable_custom_ar:
print(f"\n{'=' * 80}")
print(
f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})"
)
print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
print(f"{'=' * 80}\n")
with VllmRunner(
model_name=model_name,
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16",
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
}
model_name=model_name,
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16",
gpu_memory_utilization=0.9,
distributed_executor_backend="mp",
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
) as vllm_model:
# Use more realistic prompts for better token generation
prompts = [_random_prompt(10, 50) for i in range(32)]
@@ -273,16 +260,13 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False)
assert len(outs) == 1
# print(outs)
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -304,108 +288,91 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
failed_prompts = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
failed_prompts.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
failed_prompts.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue
# Check if tokens match first
if tokens_bs1 != tokens_bsN:
failed_prompts.append({
"prompt_idx":
i,
"step":
"sampling",
"reason":
"Different tokens sampled",
"prompt_preview":
prompts[i][:100],
"bs1_tokens":
tokens_bs1,
"bsN_tokens":
tokens_bsN,
"bs1_all_logprobs":
[logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs":
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
})
failed_prompts.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
}
)
continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
failed_prompts.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
failed_prompts.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
# Print which token failed
print(
f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}"
)
print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
failed_prompts.append({
"prompt_idx":
i,
"step":
t,
"reason":
f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview":
prompts[i][:100],
"bs1_tokens":
tokens_bs1,
"bsN_tokens":
tokens_bsN,
"bs1_all_logprobs": [
logprobs_bs1[s].tolist()
for s in range(len(logprobs_bs1))
],
"bsN_all_logprobs": [
logprobs_bsN[s].tolist()
for s in range(len(logprobs_bsN))
],
})
failed_prompts.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
}
)
break
# Print summary of all failures
if failed_prompts:
print(f"\n{'=' * 80}")
fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/"
f"{len(prompts)} prompts failed")
fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed"
print(fail_msg)
print(f"{'=' * 80}")
for fail in failed_prompts:
@@ -420,21 +387,18 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f" BS=N tokens: {fail['bsN_tokens']}")
if "bs1_all_logprobs" in fail:
print(
f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:"
)
print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}")
print(
f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:"
)
print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}")
print(f"{'=' * 80}\n")
# Fail the test with summary
msg = (f"Batch invariance violated in {len(failed_prompts)}/"
f"{len(prompts)} prompts. See output above for details.")
msg = (
f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details."
)
pytest.fail(msg)
@@ -446,18 +410,15 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch):
model = DEFAULT_MODEL
with VllmRunner(
model_name=model,
max_num_seqs=1,
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
gpu_memory_utilization=0.9,
max_model_len=2048,
dtype="float16",
enable_prefix_caching=False,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
},
distributed_executor_backend="mp",
model_name=model,
max_num_seqs=1,
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
gpu_memory_utilization=0.9,
max_model_len=2048,
dtype="float16",
enable_prefix_caching=False,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
distributed_executor_backend="mp",
) as vllm_model:
prompt = "The capital of France is"
sampling_params = SamplingParams(
@@ -479,11 +440,7 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch):
print(f"{'=' * 80}\n")
def test_aclgraph_logprobs_without_batch_invariance_should_fail(
monkeypatch: pytest.MonkeyPatch):
def test_aclgraph_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch):
"""
This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
It DISABLES batch invariance mode and expects to see non-deterministic behavior
@@ -505,19 +462,15 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
print(f"{'=' * 80}\n")
with VllmRunner(
model_name=model_name,
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16",
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
},
distributed_executor_backend="mp",
model_name=model_name,
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
max_num_seqs=32,
max_model_len=8192,
dtype="bfloat16",
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
distributed_executor_backend="mp",
) as vllm_model:
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
long_min = int(os.getenv("VLLM_MIN_PROMPT", "768"))
long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
@@ -549,16 +502,13 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False)
assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -579,84 +529,90 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
differences_found = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
differences_found.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
differences_found.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue
# Check if tokens match first
if tokens_bs1 != tokens_bsN:
differences_found.append({
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, "
f"Token {t}: max_diff={max_diff:.6e}")
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
# Print summary
print(f"\n{'=' * 80}")
if differences_found:
success_msg = (
f"✓ SUCCESS: Batch invariance is doing something! "
f"Found {len(differences_found)}/{len(prompts)} prompts "
f"with differences when batch invariance was DISABLED.")
f"with differences when batch invariance was DISABLED."
)
print(success_msg)
print(f"{'=' * 80}")
for diff in differences_found:
@@ -676,7 +632,8 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
f"between BS=1 and BS=N even with batch invariance DISABLED. "
f"This suggests batch invariance might not be necessary, "
f"or the test needs more sensitive prompts.")
f"or the test needs more sensitive prompts."
)
print(fail_msg)
print(f"{'=' * 80}\n")
pytest.fail(fail_msg)

View File

@@ -40,7 +40,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
capture_mem_after = multiprocessing.Value("q", -1) # long long
def capture_model_wrapper(original_method):
def wrapped(self):
mem_before = torch.npu.mem_get_info()[0] # free memory
result = original_method(self)
@@ -55,19 +54,16 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
original_capture = NPUModelRunner.capture_model
with patch.object(NPUModelRunner,
'capture_model',
new=capture_model_wrapper(original_capture)):
with patch.object(NPUModelRunner, "capture_model", new=capture_model_wrapper(original_capture)):
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=max_tokens,
temperature=0.0)
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
vllm_model = VllmRunner(model,
max_model_len=1024,
quantization="ascend")
vllm_model = VllmRunner(model, max_model_len=1024, quantization="ascend")
else:
vllm_model = VllmRunner(model)
_ = vllm_model.generate(prompts, sampling_params)
@@ -94,5 +90,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
assert mem_used_by_capture < max_mem_expected, (
f"capture_model used more memory than expected. "
f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
f"Expected: < {max_capture_mem_gib:.2f} GiB")
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
f"Expected: < {max_capture_mem_gib:.2f} GiB"
)
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

View File

@@ -15,8 +15,7 @@ from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16"
first_prompt = ("The following numbers of the sequence " +
", ".join(str(i) for i in range(10)) + " are:")
first_prompt = "The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:"
example_prompts = [
"Hello, my name is",
"The president of the United States is",
@@ -31,7 +30,9 @@ default_params = dict(
)
def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ):
def test_without_spec_decoding(
monkeypatch: pytest.MonkeyPatch,
):
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor, prefill chunking."""
test_sampling_params: list[dict[str, Any]] = [
@@ -85,11 +86,11 @@ def run_tests(
# avoid precision errors
outputs: list[tuple[str, list, list]] = []
for n, (
test_preemption,
executor,
async_scheduling,
spec_config,
test_prefill_chunking,
test_preemption,
executor,
async_scheduling,
spec_config,
test_prefill_chunking,
) in enumerate(test_configs, 1):
test_str = f"{n}/{len(test_configs)}"
test_results = run_test(
@@ -105,21 +106,18 @@ def run_tests(
outputs.append(test_results)
baseline_config, baseline_tests, _ = outputs[0]
_, _, baseline_acceptances = next((o for o in outputs if o[2] is not None),
(None, None, None))
_, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), (None, None, None))
print(
f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}"
)
print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}")
failure = None
for test_config, test_outputs, test_acceptance_rates in outputs[1:]:
for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip(
baseline_tests,
baseline_acceptances or repeat(None),
test_outputs,
test_acceptance_rates or repeat(None),
test_sampling_params,
baseline_tests,
baseline_acceptances or repeat(None),
test_outputs,
test_acceptance_rates or repeat(None),
test_sampling_params,
):
try:
check_outputs_equal(
@@ -129,21 +127,18 @@ def run_tests(
name_1=f"config=[{test_config}], params={params}",
)
if (base_acceptance_rate is not None
and test_acceptance_rate is not None):
if base_acceptance_rate is not None and test_acceptance_rate is not None:
if "spec_mml=None" in test_config:
assert (test_acceptance_rate > base_acceptance_rate
or test_acceptance_rate == pytest.approx(
base_acceptance_rate, rel=5e-2))
assert test_acceptance_rate > base_acceptance_rate or test_acceptance_rate == pytest.approx(
base_acceptance_rate, rel=5e-2
)
else:
# Currently the reported acceptance rate is expected to be
# lower when we sometimes skip drafting altogether.
assert test_acceptance_rate > 0.1
print(f"PASSED: config=[{test_config}], params={params}"
f" accept_rate={test_acceptance_rate}")
print(f"PASSED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
except AssertionError as e:
print(f"FAILED: config=[{test_config}], params={params}"
f" accept_rate={test_acceptance_rate}")
print(f"FAILED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
if failure is None:
failure = e
@@ -161,33 +156,35 @@ def run_test(
spec_config: dict[str, Any] | None,
test_prefill_chunking: bool,
):
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
# Force preemptions
dict(num_gpu_blocks_override=2) if test_preemption else dict(
gpu_memory_utilization=0.9))
dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9)
)
spec_mml = (spec_config or {}).get("max_model_len")
test_config = (f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}")
test_config = (
f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
)
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
with VllmRunner(
model,
max_model_len=512,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype="float16", # avoid precision errors
speculative_config=spec_config,
disable_log_stats=False,
**cache_arg,
model,
max_model_len=512,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype="float16", # avoid precision errors
speculative_config=spec_config,
disable_log_stats=False,
**cache_arg,
) as vllm_model:
results = []
acceptance_rates: list[float] | None = [] if spec_decoding else None
@@ -197,26 +194,23 @@ def run_test(
results.append(
vllm_model.generate(
example_prompts,
sampling_params=SamplingParams(**default_params,
**override_params),
))
sampling_params=SamplingParams(**default_params, **override_params),
)
)
metrics_after = vllm_model.model.get_metrics()
if acceptance_rates is not None:
acceptance_rate = _get_acceptance_rate(metrics_before,
metrics_after)
acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after)
acceptance_rates.append(acceptance_rate)
print(f"ACCEPTANCE RATE {acceptance_rate}")
if test_preemption:
preemptions = _get_count(metrics_before, metrics_after,
"vllm:num_preemptions")
preemptions = _get_count(metrics_before, metrics_after, "vllm:num_preemptions")
assert preemptions > 0, "preemption test had no preemptions"
if len(results) > 1:
# First check that the different parameter configs
# actually result in different output.
for other_test_outs, params in zip(results[1:],
sampling_param_tests[1:]):
for other_test_outs, params in zip(results[1:], sampling_param_tests[1:]):
with pytest.raises(AssertionError):
check_outputs_equal(
outputs_0_lst=results[0][0],
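
# A minimal, standalone sketch of the acceptance-rate tolerance check used in run_tests above:
# the test's rate must either beat the baseline or match it within 5% relative tolerance
# (pytest.approx(..., rel=5e-2)). The rates below are made-up values for illustration only.
import pytest

def acceptance_ok(test_rate: float, base_rate: float) -> bool:
    return test_rate > base_rate or test_rate == pytest.approx(base_rate, rel=5e-2)

assert acceptance_ok(0.62, 0.60)      # higher than baseline -> accepted
assert acceptance_ok(0.58, 0.60)      # within 5% of 0.60 (tolerance 0.03) -> accepted
assert not acceptance_ok(0.50, 0.60)  # more than 5% below baseline -> rejected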

View File

@@ -42,6 +42,7 @@ def new_kv_cache_spec(
attention_chunk_size=attention_chunk_size,
)
def test_auto_fit_max_model_len():
"""Test that max_model_len=-1 auto-fits to available NPU memory."""
# Create config with original_max_model_len=-1 to trigger auto-fit
@@ -59,9 +60,7 @@ def test_auto_fit_max_model_len():
# With enough memory, max_model_len stays at the derived max
large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory
_kv_cache_configs = get_kv_cache_configs(
vllm_config, [kv_cache_specs], [large_available_memory]
)
_kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory])
assert vllm_config.model_config.max_model_len == 1024
# Reset for next test
@@ -73,9 +72,7 @@ def test_auto_fit_max_model_len():
# Need memory for at least max_model_len tokens
# 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
limited_memory = mem_per_block_per_layer * 2 * 32
_kv_cache_configs = get_kv_cache_configs(
vllm_config, [kv_cache_specs], [limited_memory]
)
_kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [limited_memory])
# Should be reduced to fit in memory
assert vllm_config.model_config.max_model_len < 1024
assert vllm_config.model_config.max_model_len > 0
@@ -94,7 +91,5 @@ def test_auto_fit_max_model_len_not_triggered():
}
# This should work normally without auto-fit
_kv_cache_configs = get_kv_cache_configs(
vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
)
_kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32])
assert vllm_config.model_config.max_model_len == 16
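
# A rough sketch of the capacity arithmetic behind the auto-fit assertions above, assuming
# 2 layers and a 16-token block size as in the test; bytes_per_block_per_layer is a made-up
# stand-in for mem_per_block_per_layer.
def max_tokens_that_fit(available_bytes: int, bytes_per_block_per_layer: int,
                        num_layers: int, block_size: int) -> int:
    blocks = available_bytes // (bytes_per_block_per_layer * num_layers)
    return blocks * block_size

bytes_per_block_per_layer = 4096
limited_memory = bytes_per_block_per_layer * 2 * 32  # 32 blocks' worth of memory for 2 layers
assert max_tokens_that_fit(limited_memory, bytes_per_block_per_layer, num_layers=2, block_size=16) == 512
# 512 < 1024, so the auto-fit path must shrink max_model_len; with ample memory it stays at 1024.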

View File

@@ -70,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
if target_words > 50:
# For longer prompts, repeat context
padding_text = (
" This is an interesting topic that deserves more explanation. " *
(target_words // 50))
padding_text = " This is an interesting topic that deserves more explanation. " * (target_words // 50)
base_prompt = base_prompt + padding_text
return base_prompt
@@ -83,10 +81,7 @@ def _extract_step_logprobs(request_output):
inner = request_output.outputs[0]
if hasattr(inner, "logprobs") and inner.logprobs is not None:
t = torch.tensor(
[
inner.logprobs[i][tid].logprob
for i, tid in enumerate(inner.token_ids)
],
[inner.logprobs[i][tid].logprob for i, tid in enumerate(inner.token_ids)],
dtype=torch.float32,
)
return t, inner.token_ids
@@ -95,8 +90,7 @@ def _extract_step_logprobs(request_output):
@pytest.mark.timeout(1000)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
monkeypatch: pytest.MonkeyPatch):
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch):
"""
Ensures that the same request (the 'needle' prompt) yields identical output
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
@@ -184,8 +178,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
if i == needle_pos:
prompts.append(needle_prompt)
else:
prompts.append(
_random_prompt(min_random_prompt, max_random_prompt))
prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
# Generate with the larger-batch engine
outputs = llm.generate(prompts, sampling)
@@ -196,27 +189,27 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
text = needle_output.outputs[0].text
if text != baseline_text:
print(
f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
mismatches += 1
passes = num_trials - mismatches
# Dump how many passed vs failed
print(f"[determinism] total={num_trials}, passed={passes}, "
f"failed={mismatches}, max_batch_size={max_batch_size}")
print(
f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}"
)
if mismatches > 0:
pytest.fail(
f"Nondeterministic outputs detected: {mismatches} failed out "
f"of {num_trials} trials (max_batch_size={max_batch_size}).")
f"of {num_trials} trials (max_batch_size={max_batch_size})."
)
finally:
del llm
cleanup_dist_env_and_memory()
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
monkeypatch: pytest.MonkeyPatch):
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch):
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = DEFAULT_MODEL
@@ -230,9 +223,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
if disable_custom_ar:
print(f"\n{'=' * 80}")
print(
f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})"
)
print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
print(f"{'=' * 80}\n")
llm = LLM(
@@ -266,15 +257,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = llm.generate([p], sp, use_tqdm=False)
assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -296,108 +284,92 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
failed_prompts = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
failed_prompts.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
failed_prompts.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue
# Check if tokens match first
if tokens_bs1 != tokens_bsN:
failed_prompts.append({
"prompt_idx":
i,
"step":
"sampling",
"reason":
"Different tokens sampled",
"prompt_preview":
prompts[i][:100],
"bs1_tokens":
tokens_bs1,
"bsN_tokens":
tokens_bsN,
"bs1_all_logprobs":
[logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs":
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
})
failed_prompts.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
}
)
continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
failed_prompts.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
failed_prompts.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
# Print which token failed
print(
f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}"
)
print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
failed_prompts.append({
"prompt_idx":
i,
"step":
t,
"reason":
f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview":
prompts[i][:100],
"bs1_tokens":
tokens_bs1,
"bsN_tokens":
tokens_bsN,
"bs1_all_logprobs": [
logprobs_bs1[s].tolist()
for s in range(len(logprobs_bs1))
],
"bsN_all_logprobs": [
logprobs_bsN[s].tolist()
for s in range(len(logprobs_bsN))
],
})
failed_prompts.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
"bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
}
)
break
del llm
cleanup_dist_env_and_memory()
# Print summary of all failures
if failed_prompts:
print(f"\n{'=' * 80}")
fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/"
f"{len(prompts)} prompts failed")
fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed"
print(fail_msg)
print(f"{'=' * 80}")
for fail in failed_prompts:
@@ -412,21 +384,18 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f" BS=N tokens: {fail['bsN_tokens']}")
if "bs1_all_logprobs" in fail:
print(
f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:"
)
print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}")
print(
f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:"
)
print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}")
print(f"{'=' * 80}\n")
# Fail the test with summary
msg = (f"Batch invariance violated in {len(failed_prompts)}/"
f"{len(prompts)} prompts. See output above for details.")
msg = (
f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details."
)
pytest.fail(msg)
@@ -476,8 +445,7 @@ def test_simple_generation(monkeypatch: pytest.MonkeyPatch):
cleanup_dist_env_and_memory()
def test_logprobs_without_batch_invariance_should_fail(
monkeypatch: pytest.MonkeyPatch):
def test_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch):
"""
This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
It DISABLES batch invariance mode and expects to see non-deterministic behavior
@@ -540,15 +508,12 @@ def test_logprobs_without_batch_invariance_should_fail(
bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts):
print(
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
outs = llm.generate([p], sp, use_tqdm=False)
assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -569,74 +534,80 @@ def test_logprobs_without_batch_invariance_should_fail(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; "
"enable logprobs return to run this test.")
pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
differences_found = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)):
zip(
bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt,
bs1_tokens_per_prompt,
bsN_tokens_per_prompt,
)
):
if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) "
f"vs {len(logprobs_bsN)} (BS=N)")
differences_found.append({
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
differences_found.append(
{
"prompt_idx": i,
"step": "all",
"reason": reason,
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue
# Check if tokens match first
if tokens_bs1 != tokens_bsN:
differences_found.append({
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": "sampling",
"reason": "Different tokens sampled",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape:
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item()
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, "
f"Token {t}: max_diff={max_diff:.6e}")
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}")
differences_found.append({
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
})
differences_found.append(
{
"prompt_idx": i,
"step": t,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN,
}
)
break
del llm
cleanup_dist_env_and_memory()
@@ -646,7 +617,8 @@ def test_logprobs_without_batch_invariance_should_fail(
success_msg = (
f"✓ SUCCESS: Batch invariance is doing something! "
f"Found {len(differences_found)}/{len(prompts)} prompts "
f"with differences when batch invariance was DISABLED.")
f"with differences when batch invariance was DISABLED."
)
print(success_msg)
print(f"{'=' * 80}")
for diff in differences_found:
@@ -666,7 +638,8 @@ def test_logprobs_without_batch_invariance_should_fail(
f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
f"between BS=1 and BS=N even with batch invariance DISABLED. "
f"This suggests batch invariance might not be necessary, "
f"or the test needs more sensitive prompts.")
f"or the test needs more sensitive prompts."
)
print(fail_msg)
print(f"{'=' * 80}\n")
pytest.fail(fail_msg)
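
# A minimal sketch of the per-step comparison both batch-invariance tests above rely on:
# logprobs must match bitwise (torch.equal); on mismatch the max absolute difference is
# reported. Tensor values here are arbitrary examples.
import torch

def compare_step(a: torch.Tensor, b: torch.Tensor) -> float | None:
    """Return None when the tensors match bitwise, otherwise the max absolute difference."""
    if a.shape != b.shape:
        raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")
    if torch.equal(a, b):
        return None
    return torch.abs(a - b).max().item()

a = torch.tensor([-1.25, -3.5], dtype=torch.float32)
assert compare_step(a, a.clone()) is None   # bitwise identical -> invariant
assert compare_step(a, a + 1e-6) > 0        # any drift at all counts as a divergence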

View File

@@ -37,10 +37,7 @@ def test_end_to_end():
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
with VllmRunner("Qwen/Qwen3-0.6B",
enable_sleep_mode=True,
cudagraph_capture_sizes=[1, 2, 4, 8]) as runner:
with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True, cudagraph_capture_sizes=[1, 2, 4, 8]) as runner:
output = runner.model.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. Therefore, we only

View File

@@ -30,9 +30,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]
def get_prompt_embeds(chat, tokenizer, embedding_layer):
"""Convert chat messages to prompt embeddings."""
token_ids = tokenizer.apply_chat_template(chat,
add_generation_prompt=True,
return_tensors='pt')
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds
@@ -53,15 +51,16 @@ def test_mixed_prompt_embeds_and_text(model_name):
# Run inference with mixed inputs
with VllmRunner(
model_name,
enable_prompt_embeds=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
model_name,
enable_prompt_embeds=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_runner:
# Test prompt embeddings
embeds_output = vllm_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})
embeds_output = vllm_runner.model.generate(
{
"prompt_embeds": prompt_embeds,
}
)
# Test text prompt
text_output = vllm_runner.model.generate(text_prompt)
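
# One way to obtain the `tokenizer` and `embedding_layer` that get_prompt_embeds above
# expects, sketched with Hugging Face transformers; the test's own fixtures may construct
# them differently. The model name simply reuses the MODELS entry shown above.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = hf_model.get_input_embeddings()

chat = [{"role": "user", "content": "How are you?"}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
prompt_embeds = embedding_layer(token_ids).squeeze(0)  # shape: (seq_len, hidden_size)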

View File

@@ -107,15 +107,13 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):
def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
sampling_params = SamplingParams(max_tokens=1)
cpu_block_size = (llm.llm_engine.vllm_config.kv_transfer_config.
kv_connector_extra_config["block_size"])
cpu_block_size = llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config["block_size"]
subscriber.get_new_cpu_stored_events()
# prepend prompt to be cpu block aligned
prompt = "Let's count to 10. One, two, three, four,"
while (len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) %
cpu_block_size != 0):
while len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size != 0:
prompt = ". " + prompt
assert subscriber.get_new_cpu_stored_events()
@@ -123,8 +121,7 @@ def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
test_count = 100
success_count = 0
for i in range(test_count):
if (llm.generate(prompt, sampling_params,
use_tqdm=False)[0].outputs[0].text == " five"):
if llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text == " five":
success_count += 1
assert success_count >= 0.5 * test_count
@@ -143,7 +140,7 @@ def test_cpu_offloading() -> None:
"num_cpu_blocks": 1000,
"block_size": 128,
"spec_name": "NPUOffloadingSpec",
"spec_module_path": "vllm_ascend.kv_offload.npu"
"spec_module_path": "vllm_ascend.kv_offload.npu",
},
)
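
# A sketch of the block-alignment loop in _accuracy_test above: keep prepending ". " until
# the prompt's token count is a multiple of the CPU block size. `count_tokens` is a toy
# stand-in; the real test derives the count from llm.generate(...)[0].prompt_token_ids.
def align_prompt(prompt: str, cpu_block_size: int, count_tokens) -> str:
    while count_tokens(prompt) % cpu_block_size != 0:
        prompt = ". " + prompt
    return prompt

# Toy counter: one token per whitespace-separated word (an assumption for illustration).
def count_words(p: str) -> int:
    return len(p.split())

aligned = align_prompt("Let's count to 10. One, two, three, four,", cpu_block_size=3,
                       count_tokens=count_words)
assert count_words(aligned) % 3 == 0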

View File

@@ -17,7 +17,7 @@
# limitations under the License.
#
import json
from typing import Any, Dict
from typing import Any
import jsonschema
import pytest
@@ -34,8 +34,10 @@ GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]
@pytest.fixture(scope="module")
def sample_regex():
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
return (
r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
@pytest.fixture(scope="module")
@@ -43,66 +45,41 @@ def sample_json_schema():
return {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {"type": "array", "items": {"type": "string", "maxLength": 10}, "minItems": 3},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema):
runner_kwargs: Dict[str, Any] = {}
def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema):
runner_kwargs: dict[str, Any] = {}
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
structured_outputs=StructuredOutputsParams(json=sample_json_schema))
temperature=1.0, max_tokens=500, structured_outputs=StructuredOutputsParams(json=sample_json_schema)
)
runner_kwargs = {
"cudagraph_capture_sizes": [1, 2, 4, 8],
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
"structured_outputs_config": {"backend": guided_decoding_backend},
}
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2
prompts = [f"Give an example JSON for an employee profile that fits this schema: {sample_json_schema}"] * 2
inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs,
sampling_params=sampling_params)
outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params)
assert outputs is not None
@@ -115,34 +92,27 @@ def test_guided_json_completion(guided_decoding_backend: str,
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json,
schema=sample_json_schema)
jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.")
runner_kwargs: Dict[str, Any] = {}
runner_kwargs: dict[str, Any] = {}
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
structured_outputs=StructuredOutputsParams(regex=sample_regex))
temperature=0.8, top_p=0.95, structured_outputs=StructuredOutputsParams(regex=sample_regex)
)
runner_kwargs = {
"cudagraph_capture_sizes": [1, 2, 4, 8],
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
"structured_outputs_config": {"backend": guided_decoding_backend},
}
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2
prompts = [f"Give an example IPv4 address with this regex: {sample_regex}"] * 2
inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs,
sampling_params=sampling_params)
outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params)
assert outputs is not None
for output in outputs:
assert output is not None
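
# A tiny standalone example of the jsonschema.validate call used above: validation passes
# silently for a conforming instance and raises ValidationError otherwise. The schema here
# is a trimmed-down stand-in for sample_json_schema.
import jsonschema

schema = {"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}
jsonschema.validate(instance={"name": "Alice"}, schema=schema)  # passes silently

try:
    jsonschema.validate(instance={"name": 42}, schema=schema)
except jsonschema.ValidationError:
    pass  # a non-string "name" is rejected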

View File

@@ -19,20 +19,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query=
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
),
PROMPT_TEMPLATE.format(
query=
"What are all distinct countries where singers above age 20 are from?" # noqa: E501
query="What are all distinct countries where singers above age 20 are from?" # noqa: E501
),
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
prompts, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -45,16 +41,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
def test_ilama_lora(ilama_lora_files):
with VllmRunner(
MODEL_PATH,
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_num_seqs=16,
enforce_eager=True,
MODEL_PATH,
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_num_seqs=16,
enforce_eager=True,
) as vllm_model:
output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -1,12 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from unittest.mock import patch
import pytest
import vllm
import vllm.config
from vllm.lora.request import LoRARequest
from unittest.mock import patch
from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op
@@ -53,17 +53,12 @@ def do_sample(
PROMPT_TEMPLATE.format(context="How many candidates are there?"),
PROMPT_TEMPLATE.format(context="Count the number of candidates."),
PROMPT_TEMPLATE.format(
context=
"Which poll resource provided the most number of candidate information?" # noqa: E501
context="Which poll resource provided the most number of candidate information?" # noqa: E501
),
PROMPT_TEMPLATE.format(
context=
"Return the poll resource associated with the most candidates."),
PROMPT_TEMPLATE.format(context="Return the poll resource associated with the most candidates."),
]
sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=64,
stop=["<|im_end|>"])
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64, stop=["<|im_end|>"])
if tensorizer_config_dict is not None:
outputs = llm.generate(
prompts,
@@ -73,14 +68,15 @@ def do_sample(
lora_id,
lora_path,
tensorizer_config_dict=tensorizer_config_dict,
) if lora_id else None,
)
if lora_id
else None,
)
else:
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
generated_texts: list[str] = []
@@ -92,33 +88,40 @@ def do_sample(
return generated_texts
def generate_and_test(llm,
llama32_lora_files,
tensorizer_config_dict: dict | None = None):
def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | None = None):
print("lora adapter created")
print("lora 1")
assert (do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1,
) == EXPECTED_LORA_OUTPUT)
assert (
do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1,
)
== EXPECTED_LORA_OUTPUT
)
print("lora 2")
assert (do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2,
) == EXPECTED_LORA_OUTPUT)
assert (
do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2,
)
== EXPECTED_LORA_OUTPUT
)
print("base model")
assert (do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0,
) == EXPECTED_BASE_MODEL_OUTPUT)
assert (
do_sample(
llm,
llama32_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0,
)
== EXPECTED_BASE_MODEL_OUTPUT
)
print("removing lora")

View File

@@ -45,9 +45,7 @@ def test_minicpm(model) -> None:
]
max_tokens = 5
with VllmRunner(model,
max_model_len=512,
gpu_memory_utilization=0.7) as runner:
with VllmRunner(model, max_model_len=512, gpu_memory_utilization=0.7) as runner:
runner.generate_greedy(example_prompts, max_tokens)
@@ -56,19 +54,12 @@ def test_whisper(model) -> None:
prompts = ["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"]
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None)
with VllmRunner(model,
max_model_len=448,
max_num_seqs=5,
dtype="bfloat16",
block_size=128,
gpu_memory_utilization=0.9) as runner:
outputs = runner.generate(prompts=prompts,
audios=audios,
sampling_params=sampling_params)
with VllmRunner(
model, max_model_len=448, max_num_seqs=5, dtype="bfloat16", block_size=128, gpu_memory_utilization=0.9
) as runner:
outputs = runner.generate(prompts=prompts, audios=audios, sampling_params=sampling_params)
assert outputs is not None, "Generated outputs should not be None."
assert len(outputs) > 0, "Generated outputs should not be empty."

View File

@@ -39,59 +39,56 @@ def test_models_with_multistream_overlap_shared_expert(
max_tokens: int,
) -> None:
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
) as runner:
vllm_moe_ms_eager_outputs = runner.model.generate(
prompts, sampling_params)
vllm_moe_ms_eager_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={
"multistream_overlap_shared_expert": True,
},
quantization="ascend",
) as runner:
vllm_moe_ms_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
vllm_moe_ms_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
quantization="ascend",
model,
max_model_len=1024,
enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32],
quantization="ascend",
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_moe_ms_eager_outputs_list = []
for output in vllm_moe_ms_eager_outputs:
vllm_moe_ms_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_aclgraph_outputs_list = []
for output in vllm_moe_ms_aclgraph_outputs:
vllm_moe_ms_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -19,6 +19,7 @@ from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
# fmt: off
def test_qwen3_w8a8_quant():
max_tokens = 5
example_prompts = [
@@ -29,6 +30,7 @@ def test_qwen3_w8a8_quant():
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)]
# fmt: on
with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A8",
@@ -47,7 +49,7 @@ def test_qwen3_w8a8_quant():
name_1="vllm_quant_w8a8_outputs",
)
# fmt: off
def test_qwen3_dense_w8a16():
max_tokens = 5
example_prompts = [
@@ -58,6 +60,7 @@ def test_qwen3_dense_w8a16():
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)]
# fmt: on
with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A16",

View File

@@ -1,8 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
from vllm import SamplingParams
from vllm.lora.request import LoRARequest
from unittest.mock import patch
from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op
@@ -27,16 +28,11 @@ LORA_TEST_EXPECTED = [
def format_chatml_messages(prompt: str):
return [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": prompt
},
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
]
@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
def test_multi_loras_with_tp_sync():
lora_name_id_map = {}
@@ -102,9 +98,7 @@ def test_multi_loras_with_tp_sync():
outputs = llm.chat(
[messages],
sampling_params,
chat_template_kwargs={
"enable_thinking": False
}, # for those loras, ensure enable_thinking=False
chat_template_kwargs={"enable_thinking": False}, # for those loras, ensure enable_thinking=False
lora_request=lora_request,
use_tqdm=False,
)
@@ -113,15 +107,13 @@ def test_multi_loras_with_tp_sync():
def reload_lora(name: str):
"""
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
for dynamic lora loading and unloading
"""
remove_lora_response = llm.llm_engine.remove_lora(
lora_id=lora_name_id_map[name])
remove_lora_response = llm.llm_engine.remove_lora(lora_id=lora_name_id_map[name])
add_lora_response = llm.llm_engine.add_lora(
make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
add_lora_response = llm.llm_engine.add_lora(make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
print(f"{remove_lora_response=}, {add_lora_response=}")
@@ -131,7 +123,6 @@ def test_multi_loras_with_tp_sync():
assert outputs == expected
for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output, prompt)

View File

@@ -25,15 +25,11 @@ def test_qwen3_topk() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as runner:
with VllmRunner(
"Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7
) as runner:
runner.generate(example_prompts, sampling_params)
@@ -42,29 +38,25 @@ def test_qwen3_prompt_logprobs() -> None:
"Hello, my name is",
]
with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as runner:
runner.generate_greedy_logprobs(example_prompts,
max_tokens=5,
num_logprobs=1)
with VllmRunner(
"Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7
) as runner:
runner.generate_greedy_logprobs(example_prompts, max_tokens=5, num_logprobs=1)
def test_qwen3_exponential_overlap() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=1.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9)
with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
additional_config={
"enable_async_exponential": True,
}) as runner:
with VllmRunner(
"Qwen/Qwen3-0.6B",
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
additional_config={
"enable_async_exponential": True,
},
) as runner:
runner.generate(example_prompts, sampling_params)

View File

@@ -20,6 +20,7 @@
Run `pytest tests/test_offline_inference.py`.
"""
import os
from unittest.mock import patch
@@ -44,11 +45,13 @@ def test_multimodal_vl(vl_config):
images = [image] * len(img_questions)
prompts = vl_config["prompt_fn"](img_questions)
with VllmRunner(vl_config["model"],
mm_processor_kwargs=vl_config["mm_processor_kwargs"],
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
limit_mm_per_prompt={"image": 1}) as vllm_model:
with VllmRunner(
vl_config["model"],
mm_processor_kwargs=vl_config["mm_processor_kwargs"],
max_model_len=8192,
cudagraph_capture_sizes=[1, 2, 4, 8],
limit_mm_per_prompt={"image": 1},
) as vllm_model:
outputs = vllm_model.generate_greedy(
prompts=prompts,
images=images,
@@ -63,35 +66,30 @@ def test_multimodal_vl(vl_config):
@patch.dict(os.environ, {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"})
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
for idx in range(2)
])
audio_prompt = "".join([f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(2)])
question = "What sport and what nursery rhyme are referenced?"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in
[AudioAsset("mary_had_lamb"),
AudioAsset("winning_call")]
]
"audio": [asset.audio_and_sample_rate for asset in [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]]
}
inputs = {"prompt": prompt, "multi_modal_data": mm_data}
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None)
with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct",
max_model_len=4096,
max_num_seqs=5,
dtype="bfloat16",
limit_mm_per_prompt={"audio": 2},
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.9) as runner:
with VllmRunner(
"Qwen/Qwen2-Audio-7B-Instruct",
max_model_len=4096,
max_num_seqs=5,
dtype="bfloat16",
limit_mm_per_prompt={"audio": 2},
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.9,
) as runner:
outputs = runner.generate(inputs, sampling_params=sampling_params)
assert outputs is not None, "Generated outputs should not be None."

View File

@@ -20,13 +20,14 @@ Compare the outputs of vLLM with and without xlite.
Run `pytest tests/e2e/singlecard/test_xlite.py`.
"""
# ruff: noqa: E501
import os
import pytest
from vllm import SamplingParams
from tests.e2e.singlecard.utils import (PROMPTS_SHORT, LLMTestCase,
gen_and_valid)
from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid
os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"
@@ -35,9 +36,9 @@ CASE_DECODE_ONLY = LLMTestCase(
prompts=PROMPTS_SHORT,
golden_answers=[
"Hello, my name is Lina. I'm a 22-year-old student from China.",
'The president of the United States is the same as the president of the United Nations. This is because the president',
'The capital of France is Paris. The capital of France is also the capital of the French Republic.',
'The future of AI is not just a technological challenge but a profound transformation of how we live, work'
"The president of the United States is the same as the president of the United Nations. This is because the president",
"The capital of France is Paris. The capital of France is also the capital of the French Republic.",
"The future of AI is not just a technological challenge but a profound transformation of how we live, work",
],
sampling_params=SamplingParams(
max_tokens=15,
@@ -45,19 +46,22 @@ CASE_DECODE_ONLY = LLMTestCase(
top_p=1.0,
top_k=0,
n=1,
))
),
)
CASE_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=[
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
],
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
' Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital',
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and"
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
sampling_params=SamplingParams(
max_tokens=32,
@@ -65,27 +69,25 @@ CASE_FULL = LLMTestCase(
top_p=1.0,
top_k=0,
n=1,
))
),
)
@pytest.mark.skip(
reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
@pytest.mark.skip(reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
@pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY])
def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
runner_kwargs = {
"model_name": cur_case.model,
"max_model_len": 1024,
"block_size": 128,
"additional_config": {
"xlite_graph_config": {
"enabled": True
}
},
"additional_config": {"xlite_graph_config": {"enabled": True}},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize("cur_case", [CASE_FULL])
@@ -94,14 +96,11 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
"model_name": cur_case.model,
"max_model_len": 1024,
"block_size": 128,
"additional_config": {
"xlite_graph_config": {
"enabled": True,
"full_mode": True
}
},
"additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}},
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)

View File

@@ -1,5 +1,4 @@
from dataclasses import dataclass, field
from typing import Optional
from vllm import SamplingParams
@@ -7,37 +6,44 @@ from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
PROMPTS_SHORT = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# NOTE: Randomly fill the prompt with the requested amount for
# the specified capture shape to prevent accuracy issues caused by padding
PROMPTS_LONG = [
('Solve the following math problem step by step.'
'The last line of your response should be of the form Answer: '
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$'
'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,'
'$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.'
'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,'
'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'
),
('Solve the following math problem step by step.'
'The last line of your response should be of the form Answer: '
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen'
'independently and uniformly at random on the perimeter of $ABCD$.'
'If the expected value of the area of triangle $\\triangle AXY$'
'can be expressed as $\\frac{m}{n}$, for relatively prime positive'
'integers $m$ and $n$, compute $m+n$.'),
('Solve the following math problem step by step.'
'The last line of your response should be of the form Answer: '
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n'
'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$'
'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$'
'and $x^2 + cx + b = 0$ also have a common real root.'
'Compute the sum $a + b + c$.')
(
"Solve the following math problem step by step."
"The last line of your response should be of the form Answer: "
"$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$"
"be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,"
"$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$."
"If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,"
"where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$."
),
(
"Solve the following math problem step by step."
"The last line of your response should be of the form Answer: "
"$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen"
"independently and uniformly at random on the perimeter of $ABCD$."
"If the expected value of the area of triangle $\\triangle AXY$"
"can be expressed as $\\frac{m}{n}$, for relatively prime positive"
"integers $m$ and $n$, compute $m+n$."
),
(
"Solve the following math problem step by step."
"The last line of your response should be of the form Answer: "
"$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$"
"and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$"
"and $x^2 + cx + b = 0$ also have a common real root."
"Compute the sum $a + b + c$."
),
]
@@ -46,7 +52,7 @@ class LLMTestCase:
model: str
prompts: list[str]
golden_answers: list[str]
quantization: Optional[str] = None
quantization: str | None = None
sampling_params: SamplingParams = field(
default_factory=lambda: SamplingParams(
max_tokens=32,
@@ -54,14 +60,13 @@ class LLMTestCase:
top_p=1.0,
top_k=0,
n=1,
))
)
)
def gen_and_valid(runner_kwargs: dict, prompts: list[str],
sampling_params: SamplingParams, golden_answers: list[str]):
def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]):
with VllmRunner(**runner_kwargs) as runner:
vllm_aclgraph_outputs = runner.model.generate(
prompts=prompts, sampling_params=sampling_params)
vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params)
outputs_gen = []
for output in vllm_aclgraph_outputs:
outputs_gen.append(([output.outputs[0].index], output.outputs[0].text))
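
# The LLMTestCase dataclass above defaults `sampling_params` through field(default_factory=...)
# because a dataclass field cannot safely take a mutable object as a plain default (list/dict
# defaults are rejected outright, and other objects would be shared across instances).
# A minimal sketch of the same pattern with made-up fields:
from dataclasses import dataclass, field

@dataclass
class Case:
    prompts: list[str] = field(default_factory=list)
    tags: dict[str, str] = field(default_factory=dict)

a, b = Case(), Case()
a.prompts.append("Hello, my name is")
assert b.prompts == []  # every instance gets its own fresh default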