[Lint] Style: Convert tests/ to ruff format (Batch #5) (#6747)

### What this PR does / why we need it?
| File Path |
| :--- |
| `tests/e2e/singlecard/compile/backend.py` |
| `tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py` |
| `tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py` |
| `tests/e2e/singlecard/compile/test_norm_quant_fusion.py` |
| `tests/e2e/singlecard/model_runner_v2/test_basic.py` |
| `tests/e2e/singlecard/test_aclgraph_accuracy.py` |
| `tests/e2e/singlecard/test_aclgraph_batch_invariant.py` |
| `tests/e2e/singlecard/test_aclgraph_mem.py` |
| `tests/e2e/singlecard/test_async_scheduling.py` |
| `tests/e2e/singlecard/test_auto_fit_max_mode_len.py` |
| `tests/e2e/singlecard/test_batch_invariant.py` |
| `tests/e2e/singlecard/test_camem.py` |
| `tests/e2e/singlecard/test_completion_with_prompt_embeds.py` |
| `tests/e2e/singlecard/test_cpu_offloading.py` |
| `tests/e2e/singlecard/test_guided_decoding.py` |
| `tests/e2e/singlecard/test_ilama_lora.py` |
| `tests/e2e/singlecard/test_llama32_lora.py` |
| `tests/e2e/singlecard/test_models.py` |
| `tests/e2e/singlecard/test_multistream_overlap_shared_expert.py` |
| `tests/e2e/singlecard/test_quantization.py` |
| `tests/e2e/singlecard/test_qwen3_multi_loras.py` |
| `tests/e2e/singlecard/test_sampler.py` |
| `tests/e2e/singlecard/test_vlm.py` |
| `tests/e2e/singlecard/test_xlite.py` |
| `tests/e2e/singlecard/utils.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
SILONG ZENG
2026-02-24 15:50:00 +08:00
committed by GitHub
parent 747484cb64
commit 62ea664aa7
26 changed files with 859 additions and 1052 deletions

View File

@@ -46,11 +46,41 @@ plugins.md024.allow_different_nesting = true # no-duplicate-headers
plugins.md029.enabled = false # ol-prefix plugins.md029.enabled = false # ol-prefix
[tool.ruff] [tool.ruff]
# TODO: according to PEP8, there should be 80 characters per line # TODO: PEP8 recommends 79 characters per line; this project currently allows 120
line-length = 120 line-length = 120
# Folder to be modified # Folder to be modified
exclude = [ exclude = [
"tests/**", # Batch (1)
"tests/e2e/__init__.py",
"tests/e2e/310p/",
"tests/e2e/conftest.py",
"tests/e2e/doctests/",
"tests/e2e/model_utils.py",
"tests/e2e/models/",
"tests/e2e/multicard/2-cards/",
# Batch (2)
"tests/e2e/multicard/4-cards/",
"tests/e2e/nightly/multi_node/",
# Batch (3)
"tests/e2e/nightly/single_node/models/",
# Batch (4)
"tests/e2e/nightly/single_node/ops/",
# Batch (5)
# "tests/e2e/singlecard/",
# Batch (6)
"tests/e2e/nightly/single_node/ops/singlecard_ops/triton/",
"tests/e2e/singlecard/pooling/",
"tests/e2e/singlecard/spec_decode/",
"tests/e2e/utils.py",
"tests/e2e/vllm_interface/",
"tests/e2e/weekly/",
"tests/ut/",
] ]
[tool.ruff.lint] [tool.ruff.lint]

View File

@@ -14,8 +14,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
from collections.abc import Callable, Sequence
from copy import deepcopy from copy import deepcopy
from typing import Any, Callable, List, Optional, Sequence from typing import Any
import torch.fx as fx import torch.fx as fx
from torch._inductor.decomposition import select_decomp_table from torch._inductor.decomposition import select_decomp_table
@@ -37,7 +38,7 @@ class TestBackend:
records the FX graph before and after the transformation. records the FX graph before and after the transformation.
""" """
def __init__(self, custom_passes: Optional[List[Any]] = None): def __init__(self, custom_passes: list[Any] | None = None):
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
compile_config = vllm_config.compilation_config compile_config = vllm_config.compilation_config
self.inductor_config = compile_config.inductor_compile_config self.inductor_config = compile_config.inductor_compile_config
@@ -48,9 +49,7 @@ class TestBackend:
self.graph_pre_pass = None self.graph_pre_pass = None
self.graph_post_pass = None self.graph_post_pass = None
def post_pass(self, def post_pass(self, graph: fx.Graph, runtime_shape: int | None = None) -> fx.Graph:
graph: fx.Graph,
runtime_shape: int | None = None) -> fx.Graph:
""" """
Apply custom graph transformation passes. Apply custom graph transformation passes.
""" """
@@ -62,13 +61,13 @@ class TestBackend:
return graph return graph
def compile( def compile(
self, self,
graph: fx.GraphModule, graph: fx.GraphModule,
example_inputs: list[Any], example_inputs: list[Any],
compiler_config: dict[str, Any], compiler_config: dict[str, Any],
runtime_shape: Optional[int] = None, runtime_shape: int | None = None,
key: Optional[str] = None key: str | None = None,
) -> tuple[Optional[Callable], Optional[Any]]: ) -> tuple[Callable | None, Any | None]:
""" """
Compile the FX graph using vLLM's Ascend compiler interface. Compile the FX graph using vLLM's Ascend compiler interface.
Wraps the post-pass logic into the inner_compile callback. Wraps the post-pass logic into the inner_compile callback.
@@ -87,8 +86,7 @@ class TestBackend:
) )
return compiled_fn, None return compiled_fn, None
def __call__(self, gm: fx.GraphModule, def __call__(self, gm: fx.GraphModule, example_inputs: list[Any] | None):
example_inputs: Optional[List[Any]]):
""" """
Make the backend callable by torch.compile(). Make the backend callable by torch.compile().
Returns a compiled executable function. Returns a compiled executable function.
@@ -103,17 +101,11 @@ class TestBackend:
) )
return compiled_fn return compiled_fn
def find_nodes_by_target(self, graph: fx.GraphModule, def find_nodes_by_target(self, graph: fx.GraphModule, target: OpOverload) -> list[fx.Node]:
target: OpOverload) -> List[fx.Node]:
"""Helper to find all FX nodes that call a specific operator.""" """Helper to find all FX nodes that call a specific operator."""
return [ return [node for node in graph.graph.nodes if hasattr(node, "target") and node.target == target]
node for node in graph.graph.nodes
if hasattr(node, 'target') and node.target == target
]
def check_before_ops(self, def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced: bool = True):
ops: Sequence[OpOverload],
fully_replaced: bool = True):
""" """
Verify that the original (unfused) operators exist before the pass Verify that the original (unfused) operators exist before the pass
and are fully removed afterward (if fully_replaced=True). and are fully removed afterward (if fully_replaced=True).

View File

@@ -215,6 +215,7 @@ def register_pattern_safe(pattern_class, vllm_config, eps, pattern_key):
try: try:
# Import the required pass class # Import the required pass class
from torch._inductor.pattern_matcher import PatternMatcherPass from torch._inductor.pattern_matcher import PatternMatcherPass
pm_pass = PatternMatcherPass() pm_pass = PatternMatcherPass()
pattern.register(pm_pass) pattern.register(pm_pass)
_registered_patterns.add(pattern_key) _registered_patterns.add(pattern_key)
@@ -243,7 +244,7 @@ def test_rmsnorm_quant_fusion(
sp_enable: bool, sp_enable: bool,
): ):
# Check if fusion operator is available # Check if fusion operator is available
if not hasattr(torch.ops.npu, 'npu_add_rms_norm_quant'): if not hasattr(torch.ops.npu, "npu_add_rms_norm_quant"):
pytest.skip("Fusion operator npu_add_rms_norm_quant not available, skipping test") pytest.skip("Fusion operator npu_add_rms_norm_quant not available, skipping test")
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
@@ -266,7 +267,7 @@ def test_rmsnorm_quant_fusion(
if not enable_custom_op(): if not enable_custom_op():
pytest.skip("Custom ops not available, skipping bias test") pytest.skip("Custom ops not available, skipping bias test")
# Check if the bias operator exists # Check if the bias operator exists
if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'): if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"):
pytest.skip("Operator npu_add_rms_norm_bias not available, skipping bias test") pytest.skip("Operator npu_add_rms_norm_bias not available, skipping bias test")
if sp_enable: if sp_enable:
model = ModelSPWithBias(hidden_size, dtype, eps, device="npu") model = ModelSPWithBias(hidden_size, dtype, eps, device="npu")
@@ -281,13 +282,11 @@ def test_rmsnorm_quant_fusion(
else: else:
# The non-bias patterns currently use npu_add_rms_norm_bias in their pattern matching # The non-bias patterns currently use npu_add_rms_norm_bias in their pattern matching
# so we need to skip if it's not available # so we need to skip if it's not available
if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'): if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"):
pytest.skip("Operator npu_add_rms_norm_bias not available, skipping test") pytest.skip("Operator npu_add_rms_norm_bias not available, skipping test")
if sp_enable: if sp_enable:
model = ModelSPWithoutBias(hidden_size, dtype, eps, device="npu") model = ModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
register_pattern_safe( register_pattern_safe(AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern")
AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern"
)
else: else:
model = ModelWithoutBias(hidden_size, dtype, eps, device="npu") model = ModelWithoutBias(hidden_size, dtype, eps, device="npu")
register_pattern_safe(AddRMSNormQuantPattern, vllm_config, eps, "GraphEXAddRMSNormQuantPattern") register_pattern_safe(AddRMSNormQuantPattern, vllm_config, eps, "GraphEXAddRMSNormQuantPattern")
@@ -302,5 +301,9 @@ def test_rmsnorm_quant_fusion(
compiled_out, compiled_res = compiled_model(x) compiled_out, compiled_res = compiled_model(x)
# Verify output shapes are correct # Verify output shapes are correct
assert compiled_out.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}" assert compiled_out.shape == (num_tokens, hidden_size), (
assert compiled_res.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}" f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}"
)
assert compiled_res.shape == (num_tokens, hidden_size), (
f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}"
)

View File

@@ -201,6 +201,7 @@ def test_rmsnorm_quant_fusion(
vllm_config=vllm_config, head_dim=head_dim, num_heads=num_heads, num_kv_heads=num_kv_heads, eps=eps vllm_config=vllm_config, head_dim=head_dim, num_heads=num_heads, num_kv_heads=num_kv_heads, eps=eps
) )
from torch._inductor.pattern_matcher import PatternMatcherPass from torch._inductor.pattern_matcher import PatternMatcherPass
pm_pass = PatternMatcherPass() pm_pass = PatternMatcherPass()
fusion_pattern.register(pm_pass) fusion_pattern.register(pm_pass)
model = model.to("npu") model = model.to("npu")

View File

@@ -14,25 +14,20 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
from typing import List
import pytest import pytest
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch_npu
import vllm.config import vllm.config
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized, from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
init_distributed_environment)
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
import vllm_ascend.ops.register_custom_ops # noqa import vllm_ascend.ops.register_custom_ops # noqa
from tests.e2e.singlecard.compile.backend import TestBackend from tests.e2e.singlecard.compile.backend import TestBackend
from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.compilation.passes.norm_quant_fusion_pass import \ from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass
AddRMSNormQuantFusionPass from vllm_ascend.utils import enable_custom_op, vllm_version_is
from vllm_ascend.utils import enable_custom_op
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.15.0"): if vllm_version_is("0.15.0"):
from vllm.compilation.fx_utils import OpOverload # type: ignore from vllm.compilation.fx_utils import OpOverload # type: ignore
@@ -48,34 +43,24 @@ def get_or_create_backend(vllm_config):
"""Get or create backend with fusion passes (cached to avoid duplicate pattern registration).""" """Get or create backend with fusion passes (cached to avoid duplicate pattern registration)."""
global _backend_cache global _backend_cache
if _backend_cache is None: if _backend_cache is None:
_backend_cache = TestBackend(custom_passes=[ _backend_cache = TestBackend(custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)])
AddRMSNormQuantFusionPass(vllm_config=vllm_config)
])
return _backend_cache return _backend_cache
class TestModelWithoutBias(nn.Module): class TestModelWithoutBias(nn.Module):
""" """
A minimal test model that simulates the pattern: A minimal test model that simulates the pattern:
AddRMSNorm → Quantization (without bias) AddRMSNorm → Quantization (without bias)
""" """
def __init__(self, def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
super().__init__() super().__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.eps = eps self.eps = eps
self.rms_norm_weight = nn.Parameter( self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
dtype=dtype, self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
def forward(self, x): def forward(self, x):
""" """
@@ -87,23 +72,20 @@ class TestModelWithoutBias(nn.Module):
residual = torch.zeros_like(x) residual = torch.zeros_like(x)
norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, None, self.eps) x, residual, self.rms_norm_weight, None, self.eps
)
quantized_output = torch.ops.vllm.quantize(norm_output, quantized_output = torch.ops.vllm.quantize(
self.quant_scale, norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
self.quant_scale_reciprocal, )
self.quant_offset)
return quantized_output, new_residual return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]: def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion.""" """Return the list of expected operators BEFORE fusion."""
return [ return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.quantize.default
]
def ops_in_model_after(self) -> List[OpOverload]: def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion.""" """Return the list of expected operators AFTER successful fusion."""
return [torch.ops.npu.npu_add_rms_norm_quant.default] return [torch.ops.npu.npu_add_rms_norm_quant.default]
@@ -114,24 +96,15 @@ class TestModelWithBias(nn.Module):
AddRMSNorm → Add Bias → Quantization (with bias) AddRMSNorm → Add Bias → Quantization (with bias)
""" """
def __init__(self, def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
super().__init__() super().__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.eps = eps self.eps = eps
self.rms_norm_weight = nn.Parameter( self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
torch.randn(hidden_size, device=device))
self.bias = nn.Parameter(torch.randn(hidden_size, device=device)) self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
dtype=dtype, self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
def forward(self, x): def forward(self, x):
""" """
@@ -144,23 +117,20 @@ class TestModelWithBias(nn.Module):
residual = torch.zeros_like(x) residual = torch.zeros_like(x)
norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, self.bias, self.eps) x, residual, self.rms_norm_weight, self.bias, self.eps
)
quantized_output = torch.ops.vllm.quantize(norm_output_with_bias, quantized_output = torch.ops.vllm.quantize(
self.quant_scale, norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
self.quant_scale_reciprocal, )
self.quant_offset)
return quantized_output, new_residual return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]: def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion.""" """Return the list of expected operators BEFORE fusion."""
return [ return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default]
torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.quantize.default
]
def ops_in_model_after(self) -> List[OpOverload]: def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion.""" """Return the list of expected operators AFTER successful fusion."""
return [torch.ops.npu.npu_add_rms_norm_quant.default] return [torch.ops.npu.npu_add_rms_norm_quant.default]
@@ -171,23 +141,14 @@ class TestModelSPWithoutBias(nn.Module):
AddRMSNorm → maybe_allgather → Quantization (without bias) AddRMSNorm → maybe_allgather → Quantization (without bias)
""" """
def __init__(self, def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
super().__init__() super().__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.eps = eps self.eps = eps
self.rms_norm_weight = nn.Parameter( self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
dtype=dtype, self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
def forward(self, x): def forward(self, x):
""" """
@@ -200,32 +161,28 @@ class TestModelSPWithoutBias(nn.Module):
residual = torch.zeros_like(x) residual = torch.zeros_like(x)
norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, None, self.eps) x, residual, self.rms_norm_weight, None, self.eps
)
norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output, True)
norm_output, True)
quantized_output = torch.ops.vllm.quantize(norm_output, quantized_output = torch.ops.vllm.quantize(
self.quant_scale, norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
self.quant_scale_reciprocal, )
self.quant_offset)
return quantized_output, new_residual return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]: def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion.""" """Return the list of expected operators BEFORE fusion."""
return [ return [
torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
torch.ops.vllm.quantize.default torch.ops.vllm.quantize.default,
] ]
def ops_in_model_after(self) -> List[OpOverload]: def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion.""" """Return the list of expected operators AFTER successful fusion."""
return [ return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
torch.ops.npu.npu_add_rms_norm_quant.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default
]
class TestModelSPWithBias(nn.Module): class TestModelSPWithBias(nn.Module):
@@ -234,24 +191,15 @@ class TestModelSPWithBias(nn.Module):
AddRMSNorm → Add bias → maybe_allgather → Quantization (with bias) AddRMSNorm → Add bias → maybe_allgather → Quantization (with bias)
""" """
def __init__(self, def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"):
hidden_size: int,
dtype: torch.dtype,
eps: float = 1e-6,
device="npu"):
super().__init__() super().__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.eps = eps self.eps = eps
self.rms_norm_weight = nn.Parameter( self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device))
torch.randn(hidden_size, device=device))
self.bias = nn.Parameter(torch.randn(hidden_size, device=device)) self.bias = nn.Parameter(torch.randn(hidden_size, device=device))
self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device)
self.quant_scale_reciprocal = torch.ones(hidden_size, self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device)
dtype=dtype, self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device)
device=device)
self.quant_offset = torch.zeros(hidden_size,
dtype=dtype,
device=device)
def forward(self, x): def forward(self, x):
""" """
@@ -265,32 +213,28 @@ class TestModelSPWithBias(nn.Module):
residual = torch.zeros_like(x) residual = torch.zeros_like(x)
norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias(
x, residual, self.rms_norm_weight, self.bias, self.eps) x, residual, self.rms_norm_weight, self.bias, self.eps
)
norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output_with_bias, True)
norm_output_with_bias, True)
quantized_output = torch.ops.vllm.quantize(norm_output_with_bias, quantized_output = torch.ops.vllm.quantize(
self.quant_scale, norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset
self.quant_scale_reciprocal, )
self.quant_offset)
return quantized_output, new_residual return quantized_output, new_residual
def ops_in_model_before(self) -> List[OpOverload]: def ops_in_model_before(self) -> list[OpOverload]:
"""Return the list of expected operators BEFORE fusion.""" """Return the list of expected operators BEFORE fusion."""
return [ return [
torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops._C_ascend.npu_add_rms_norm_bias.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default,
torch.ops.vllm.quantize.default torch.ops.vllm.quantize.default,
] ]
def ops_in_model_after(self) -> List[OpOverload]: def ops_in_model_after(self) -> list[OpOverload]:
"""Return the list of expected operators AFTER successful fusion.""" """Return the list of expected operators AFTER successful fusion."""
return [ return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default]
torch.ops.npu.npu_add_rms_norm_quant.default,
torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default
]
@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.bfloat16])
@@ -317,58 +261,42 @@ def test_rmsnorm_quant_fusion(
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
with vllm.config.set_current_vllm_config(vllm_config): with vllm.config.set_current_vllm_config(vllm_config):
update_environment_variables({ update_environment_variables(
"RANK": "0", {
"LOCAL_RANK": "0", "RANK": "0",
"WORLD_SIZE": "1", "LOCAL_RANK": "0",
"MASTER_ADDR": "localhost", "WORLD_SIZE": "1",
"MASTER_PORT": "12345", "MASTER_ADDR": "localhost",
}) "MASTER_PORT": "12345",
}
)
init_distributed_environment() init_distributed_environment()
ensure_model_parallel_initialized(1, 1) ensure_model_parallel_initialized(1, 1)
with vllm.config.set_current_vllm_config(vllm_config): with vllm.config.set_current_vllm_config(vllm_config), set_ascend_forward_context(None, vllm_config):
with set_ascend_forward_context(None, vllm_config): backend = get_or_create_backend(vllm_config)
backend = get_or_create_backend(vllm_config) if use_bias:
if use_bias: if not enable_custom_op():
if not enable_custom_op(): return
return if sp_enable:
if sp_enable: model = TestModelSPWithBias(hidden_size, dtype, eps, device="npu")
model = TestModelSPWithBias(hidden_size,
dtype,
eps,
device="npu")
else:
model = TestModelWithBias(hidden_size,
dtype,
eps,
device="npu")
else: else:
if sp_enable: model = TestModelWithBias(hidden_size, dtype, eps, device="npu")
model = TestModelSPWithoutBias(hidden_size, else:
dtype, if sp_enable:
eps, model = TestModelSPWithoutBias(hidden_size, dtype, eps, device="npu")
device="npu") else:
else: model = TestModelWithoutBias(hidden_size, dtype, eps, device="npu")
model = TestModelWithoutBias(hidden_size, model = model.to("npu")
dtype,
eps,
device="npu")
model = model.to("npu")
x = torch.rand(num_tokens, x = torch.rand(num_tokens, hidden_size, device="npu", dtype=dtype, requires_grad=False)
hidden_size,
device="npu",
dtype=dtype,
requires_grad=False)
result_unfused = model(x) result_unfused = model(x)
print("Unfused result:", [t.shape for t in result_unfused]) print("Unfused result:", [t.shape for t in result_unfused])
model_fused = torch.compile(model, backend=backend) model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x) result_fused = model_fused(x)
print("Fused result:", [t.shape for t in result_fused]) print("Fused result:", [t.shape for t in result_fused])
print("=== Checking operator fusion ===") print("=== Checking operator fusion ===")
backend.check_before_ops(model.ops_in_model_before(), backend.check_before_ops(model.ops_in_model_before(), fully_replaced=not sp_enable)
fully_replaced=not sp_enable) backend.check_after_ops(model.ops_in_model_after())
backend.check_after_ops(model.ops_in_model_after())

View File

@@ -47,9 +47,9 @@ def test_qwen3_dense_eager_mode(
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner( with VllmRunner(
model, model,
max_model_len=1024, max_model_len=1024,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
) as runner: ) as runner:
runner.model.generate(prompts, sampling_params) runner.model.generate(prompts, sampling_params)
@@ -74,14 +74,14 @@ def test_egale_spec_decoding(
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner( with VllmRunner(
model, model,
max_model_len=1024, max_model_len=1024,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
async_scheduling=True, async_scheduling=True,
speculative_config={ speculative_config={
"model": eagle_model, "model": eagle_model,
"method": "eagle", "method": "eagle",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
}, },
) as runner: ) as runner:
runner.model.generate(prompts, sampling_params) runner.model.generate(prompts, sampling_params)

View File

@@ -15,20 +15,22 @@
# limitations under the License. # limitations under the License.
# #
import pytest # ruff: noqa: E501
import os import os
from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT, import pytest
LLMTestCase, gen_and_valid)
from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid
CASE_QWEN_ACLGRAPH = LLMTestCase( CASE_QWEN_ACLGRAPH = LLMTestCase(
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT, prompts=PROMPTS_SHORT,
golden_answers=[ golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of', " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and' " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
], ],
) )
@@ -37,10 +39,10 @@ CASE_DS_ACLGRAPH = LLMTestCase(
quantization="ascend", quantization="ascend",
prompts=PROMPTS_SHORT, prompts=PROMPTS_SHORT,
golden_answers=[ golden_answers=[
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
' here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of' " here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of",
], ],
) )
@@ -49,9 +51,9 @@ CASE_QWEN_FULL = LLMTestCase(
prompts=PROMPTS_SHORT, prompts=PROMPTS_SHORT,
golden_answers=[ golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of', " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and' " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
], ],
) )
@@ -60,10 +62,10 @@ CASE_DS_FULL = LLMTestCase(
quantization="ascend", quantization="ascend",
prompts=PROMPTS_SHORT, prompts=PROMPTS_SHORT,
golden_answers=[ golden_answers=[
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
' here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of' " here, and its not what you think.\nThe future of AI is here, and its not what you think.\nThe future of",
], ],
) )
@@ -71,10 +73,11 @@ CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG, prompts=PROMPTS_LONG,
golden_answers=[ golden_answers=[
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over", " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
]) ],
)
CASE_DS_FULL_DECODE_ONLY = LLMTestCase( CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8", model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
@@ -83,26 +86,31 @@ CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
golden_answers=[ golden_answers=[
"\n\nSelect an assignment template", "\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use", "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations" "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
]) ],
)
CASE_QWEN_EX = LLMTestCase( CASE_QWEN_EX = LLMTestCase(
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG, prompts=PROMPTS_LONG,
golden_answers=[ golden_answers=[
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over", " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over",
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
]) ],
)
CASE_DS_EX = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations",
],
)
CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations"
])
@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH]) @pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase): def test_piecewise_res_consistency(cur_case: LLMTestCase):
@@ -112,51 +120,48 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
"cudagraph_capture_sizes": [1, 2, 4, 8], "cudagraph_capture_sizes": [1, 2, 4, 8],
"quantization": cur_case.quantization, "quantization": cur_case.quantization,
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize(
"cur_case", [CASE_QWEN_FULL, CASE_DS_FULL]) @pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch): def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
runner_kwargs = { runner_kwargs = {
"model_name": cur_case.model, "model_name": cur_case.model,
"max_model_len": 1024, "max_model_len": 1024,
"compilation_config": { "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"quantization": cur_case.quantization, "quantization": cur_case.quantization,
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize(
"cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY]) @pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch): def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
runner_kwargs = { runner_kwargs = {
"model_name": cur_case.model, "model_name": cur_case.model,
"max_model_len": 1024, "max_model_len": 1024,
"compilation_config": { "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"quantization": cur_case.quantization, "quantization": cur_case.quantization,
"additional_config": { "additional_config": {"npugraph_ex_config": {"enable": False}},
"npugraph_ex_config": {
"enable": False
}
},
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX]) @pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
@@ -165,20 +170,16 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
"model_name": cur_case.model, "model_name": cur_case.model,
"quantization": cur_case.quantization, "quantization": cur_case.quantization,
"max_model_len": 1024, "max_model_len": 1024,
"compilation_config": { "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"cudagraph_capture_sizes": [4, 8, 32, 64], "additional_config": {"npugraph_ex_config": {"enable": True}},
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"additional_config": {
"npugraph_ex_config": {
"enable": True
}
},
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
# The accuracy has already been verified in the previous test case. # The accuracy has already been verified in the previous test case.
# This test case is used to check whether the functionality works properly # This test case is used to check whether the functionality works properly
@@ -190,10 +191,7 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
"model_name": cur_case.model, "model_name": cur_case.model,
"quantization": cur_case.quantization, "quantization": cur_case.quantization,
"max_model_len": 1024, "max_model_len": 1024,
"compilation_config": { "compilation_config": {"cudagraph_capture_sizes": [4, 8], "cudagraph_mode": "FULL_DECODE_ONLY"},
"cudagraph_capture_sizes": [4, 8],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"additional_config": { "additional_config": {
"npugraph_ex_config": { "npugraph_ex_config": {
"enable": True, "enable": True,
@@ -201,12 +199,14 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
} }
}, },
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
# Check whether the static kernel is properly uninstall # Check whether the static kernel is properly uninstall
ascend_home_path = os.environ["ASCEND_HOME_PATH"] ascend_home_path = os.environ["ASCEND_HOME_PATH"]
static_kernel_install_path = os.path.join(ascend_home_path, 'opp/static_kernel/ai_core') static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core")
assert not os.path.exists(static_kernel_install_path) assert not os.path.exists(static_kernel_install_path)

View File

@@ -22,6 +22,7 @@ import random
import pytest import pytest
import torch import torch
from vllm import SamplingParams from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
DEFAULT_MODEL = "Qwen/Qwen3-0.6B" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
@@ -69,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
if target_words > 50: if target_words > 50:
# For longer prompts, repeat context # For longer prompts, repeat context
padding_text = ( padding_text = " This is an interesting topic that deserves more explanation. " * (target_words // 50)
" This is an interesting topic that deserves more explanation. " *
(target_words // 50))
base_prompt = base_prompt + padding_text base_prompt = base_prompt + padding_text
return base_prompt return base_prompt
@@ -107,8 +106,7 @@ def _extract_step_logprobs(generate_output):
@pytest.mark.timeout(1000) @pytest.mark.timeout(1000)
def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch):
monkeypatch: pytest.MonkeyPatch):
""" """
Ensures that the same request (the 'needle' prompt) yields identical output Ensures that the same request (the 'needle' prompt) yields identical output
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
@@ -162,20 +160,16 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
needle_prompt = "There once was a " needle_prompt = "There once was a "
with VllmRunner( with VllmRunner(
model_name=model, model_name=model,
max_num_seqs=max_batch_size, max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util, gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len, max_model_len=max_model_len,
dtype="bfloat16", dtype="bfloat16",
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
enable_prefix_caching=False, enable_prefix_caching=False,
distributed_executor_backend="mp", distributed_executor_backend="mp",
compilation_config={ compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
}
) as vllm_model: ) as vllm_model:
# Baseline generation for the needle prompt alone. # Baseline generation for the needle prompt alone.
baseline_out = vllm_model.generate([needle_prompt], sampling) baseline_out = vllm_model.generate([needle_prompt], sampling)
assert len(baseline_out) == 1 assert len(baseline_out) == 1
@@ -194,8 +188,7 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
if i == needle_pos: if i == needle_pos:
prompts.append(needle_prompt) prompts.append(needle_prompt)
else: else:
prompts.append( prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
_random_prompt(min_random_prompt, max_random_prompt))
# Generate with the larger-batch engine # Generate with the larger-batch engine
outputs = vllm_model.generate(prompts, sampling) outputs = vllm_model.generate(prompts, sampling)
@@ -204,24 +197,23 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(
text = needle_output[0] text = needle_output[0]
if text != baseline_text: if text != baseline_text:
print( print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
mismatches += 1 mismatches += 1
passes = num_trials - mismatches passes = num_trials - mismatches
# Dump how many passed vs failed # Dump how many passed vs failed
print(f"[determinism] total={num_trials}, passed={passes}, " print(
f"failed={mismatches}, max_batch_size={max_batch_size}") f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}"
)
if mismatches > 0: if mismatches > 0:
pytest.fail( pytest.fail(
f"Nondeterministic outputs detected: {mismatches} failed out " f"Nondeterministic outputs detected: {mismatches} failed out "
f"of {num_trials} trials (max_batch_size={max_batch_size}).") f"of {num_trials} trials (max_batch_size={max_batch_size})."
)
def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch):
def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
monkeypatch: pytest.MonkeyPatch):
seed = int(os.getenv("VLLM_TEST_SEED", "12345")) seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed) random.seed(seed)
model_name = DEFAULT_MODEL model_name = DEFAULT_MODEL
@@ -235,24 +227,19 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
if disable_custom_ar: if disable_custom_ar:
print(f"\n{'=' * 80}") print(f"\n{'=' * 80}")
print( print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})"
)
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
with VllmRunner( with VllmRunner(
model_name=model_name, model_name=model_name,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
enable_prefix_caching=False, enable_prefix_caching=False,
max_num_seqs=32, max_num_seqs=32,
max_model_len=8192, max_model_len=8192,
dtype="bfloat16", dtype="bfloat16",
gpu_memory_utilization=0.9, gpu_memory_utilization=0.9,
distributed_executor_backend="mp", distributed_executor_backend="mp",
compilation_config={ compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [1, 32, 64]
}
) as vllm_model: ) as vllm_model:
# Use more realistic prompts for better token generation # Use more realistic prompts for better token generation
prompts = [_random_prompt(10, 50) for i in range(32)] prompts = [_random_prompt(10, 50) for i in range(32)]
@@ -273,16 +260,13 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
bs1_logprobs_per_prompt = [] bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = [] bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts): for idx, p in enumerate(prompts):
print( print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False) outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False)
assert len(outs) == 1 assert len(outs) == 1
# print(outs) # print(outs)
step_logprobs, token_ids = _extract_step_logprobs(outs[0]) step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs) bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids) bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -304,108 +288,91 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o) step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs) bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids) bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
failed_prompts = [] failed_prompts = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip( zip(
bs1_logprobs_per_prompt, bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt, bsN_logprobs_per_prompt,
bs1_tokens_per_prompt, bs1_tokens_per_prompt,
bsN_tokens_per_prompt, bsN_tokens_per_prompt,
)): )
):
if len(logprobs_bs1) != len(logprobs_bsN): if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
f"vs {len(logprobs_bsN)} (BS=N)") failed_prompts.append(
failed_prompts.append({ {
"prompt_idx": i, "prompt_idx": i,
"step": "all", "step": "all",
"reason": reason, "reason": reason,
"prompt_preview": prompts[i][:100], "prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1, "bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN, "bsN_tokens": tokens_bsN,
}) }
)
continue continue
# Check if tokens match first # Check if tokens match first
if tokens_bs1 != tokens_bsN: if tokens_bs1 != tokens_bsN:
failed_prompts.append({ failed_prompts.append(
"prompt_idx": {
i, "prompt_idx": i,
"step": "step": "sampling",
"sampling", "reason": "Different tokens sampled",
"reason": "prompt_preview": prompts[i][:100],
"Different tokens sampled", "bs1_tokens": tokens_bs1,
"prompt_preview": "bsN_tokens": tokens_bsN,
prompts[i][:100], "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bs1_tokens": "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
tokens_bs1, }
"bsN_tokens": )
tokens_bsN,
"bs1_all_logprobs":
[logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs":
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
})
continue continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape: if a.shape != b.shape:
failed_prompts.append({ failed_prompts.append(
"prompt_idx": i, {
"step": t, "prompt_idx": i,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}", "step": t,
"prompt_preview": prompts[i][:100], "reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
break break
if not torch.equal(a, b): if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item() max_diff = torch.abs(a - b).max().item()
# Print which token failed # Print which token failed
print( print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}"
)
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}") print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}") print(f" BS=N logprob: {b.tolist()}")
failed_prompts.append({ failed_prompts.append(
"prompt_idx": {
i, "prompt_idx": i,
"step": "step": t,
t, "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"reason": "prompt_preview": prompts[i][:100],
f"Bitwise mismatch (max_diff={max_diff:.6e})", "bs1_tokens": tokens_bs1,
"prompt_preview": "bsN_tokens": tokens_bsN,
prompts[i][:100], "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bs1_tokens": "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
tokens_bs1, }
"bsN_tokens": )
tokens_bsN,
"bs1_all_logprobs": [
logprobs_bs1[s].tolist()
for s in range(len(logprobs_bs1))
],
"bsN_all_logprobs": [
logprobs_bsN[s].tolist()
for s in range(len(logprobs_bsN))
],
})
break break
# Print summary of all failures # Print summary of all failures
if failed_prompts: if failed_prompts:
print(f"\n{'=' * 80}") print(f"\n{'=' * 80}")
fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/" fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed"
f"{len(prompts)} prompts failed")
print(fail_msg) print(fail_msg)
print(f"{'=' * 80}") print(f"{'=' * 80}")
for fail in failed_prompts: for fail in failed_prompts:
@@ -420,21 +387,18 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f" BS=N tokens: {fail['bsN_tokens']}") print(f" BS=N tokens: {fail['bsN_tokens']}")
if "bs1_all_logprobs" in fail: if "bs1_all_logprobs" in fail:
print( print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:"
)
for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]): for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}") print(f" Step {step_idx}: {logprobs}")
print( print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:"
)
for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]): for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}") print(f" Step {step_idx}: {logprobs}")
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
# Fail the test with summary # Fail the test with summary
msg = (f"Batch invariance violated in {len(failed_prompts)}/" msg = (
f"{len(prompts)} prompts. See output above for details.") f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details."
)
pytest.fail(msg) pytest.fail(msg)
@@ -446,18 +410,15 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch):
model = DEFAULT_MODEL model = DEFAULT_MODEL
with VllmRunner( with VllmRunner(
model_name=model, model_name=model,
max_num_seqs=1, max_num_seqs=1,
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
gpu_memory_utilization=0.9, gpu_memory_utilization=0.9,
max_model_len=2048, max_model_len=2048,
dtype="float16", dtype="float16",
enable_prefix_caching=False, enable_prefix_caching=False,
compilation_config={ compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
"cudagraph_mode": "FULL_DECODE_ONLY", distributed_executor_backend="mp",
"cudagraph_capture_sizes": [1, 32, 64]
},
distributed_executor_backend="mp",
) as vllm_model: ) as vllm_model:
prompt = "The capital of France is" prompt = "The capital of France is"
sampling_params = SamplingParams( sampling_params = SamplingParams(
@@ -479,11 +440,7 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch):
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
def test_aclgraph_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch):
def test_aclgraph_logprobs_without_batch_invariance_should_fail(
monkeypatch: pytest.MonkeyPatch):
""" """
This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
It DISABLES batch invariance mode and expects to see non-deterministic behavior It DISABLES batch invariance mode and expects to see non-deterministic behavior
@@ -505,19 +462,15 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
with VllmRunner( with VllmRunner(
model_name=model_name, model_name=model_name,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
enable_prefix_caching=False, enable_prefix_caching=False,
max_num_seqs=32, max_num_seqs=32,
max_model_len=8192, max_model_len=8192,
dtype="bfloat16", dtype="bfloat16",
compilation_config={ compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]},
"cudagraph_mode": "FULL_DECODE_ONLY", distributed_executor_backend="mp",
"cudagraph_capture_sizes": [1, 32, 64]
},
distributed_executor_backend="mp",
) as vllm_model: ) as vllm_model:
# build ragged prompts to change shapes significantly across BS=1 vs BS=N # build ragged prompts to change shapes significantly across BS=1 vs BS=N
long_min = int(os.getenv("VLLM_MIN_PROMPT", "768")) long_min = int(os.getenv("VLLM_MIN_PROMPT", "768"))
long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048")) long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
@@ -549,16 +502,13 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
bs1_logprobs_per_prompt = [] bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = [] bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts): for idx, p in enumerate(prompts):
print( print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False) outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False)
assert len(outs) == 1 assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0]) step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs) bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids) bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -579,84 +529,90 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o) step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs) bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids) bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
differences_found = [] differences_found = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip( zip(
bs1_logprobs_per_prompt, bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt, bsN_logprobs_per_prompt,
bs1_tokens_per_prompt, bs1_tokens_per_prompt,
bsN_tokens_per_prompt, bsN_tokens_per_prompt,
)): )
):
if len(logprobs_bs1) != len(logprobs_bsN): if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
f"vs {len(logprobs_bsN)} (BS=N)") differences_found.append(
differences_found.append({ {
"prompt_idx": i, "prompt_idx": i,
"step": "all", "step": "all",
"reason": reason, "reason": reason,
"prompt_preview": prompts[i][:100], "prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1, "bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN, "bsN_tokens": tokens_bsN,
}) }
)
continue continue
# Check if tokens match first # Check if tokens match first
if tokens_bs1 != tokens_bsN: if tokens_bs1 != tokens_bsN:
differences_found.append({ differences_found.append(
"prompt_idx": i, {
"step": "sampling", "prompt_idx": i,
"reason": "Different tokens sampled", "step": "sampling",
"prompt_preview": prompts[i][:100], "reason": "Different tokens sampled",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
continue continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape: if a.shape != b.shape:
differences_found.append({ differences_found.append(
"prompt_idx": i, {
"step": t, "prompt_idx": i,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}", "step": t,
"prompt_preview": prompts[i][:100], "reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
break break
if not torch.equal(a, b): if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item() max_diff = torch.abs(a - b).max().item()
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
f"Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}") print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}") print(f" BS=N logprob: {b.tolist()}")
differences_found.append({ differences_found.append(
"prompt_idx": i, {
"step": t, "prompt_idx": i,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", "step": t,
"prompt_preview": prompts[i][:100], "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
break break
# Print summary # Print summary
print(f"\n{'=' * 80}") print(f"\n{'=' * 80}")
if differences_found: if differences_found:
success_msg = ( success_msg = (
f"✓ SUCCESS: Batch invariance is doing something! " f"✓ SUCCESS: Batch invariance is doing something! "
f"Found {len(differences_found)}/{len(prompts)} prompts " f"Found {len(differences_found)}/{len(prompts)} prompts "
f"with differences when batch invariance was DISABLED.") f"with differences when batch invariance was DISABLED."
)
print(success_msg) print(success_msg)
print(f"{'=' * 80}") print(f"{'=' * 80}")
for diff in differences_found: for diff in differences_found:
@@ -676,7 +632,8 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail(
f"✗ UNEXPECTED: All {len(prompts)} prompts matched " f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
f"between BS=1 and BS=N even with batch invariance DISABLED. " f"between BS=1 and BS=N even with batch invariance DISABLED. "
f"This suggests batch invariance might not be necessary, " f"This suggests batch invariance might not be necessary, "
f"or the test needs more sensitive prompts.") f"or the test needs more sensitive prompts."
)
print(fail_msg) print(fail_msg)
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
pytest.fail(fail_msg) pytest.fail(fail_msg)

View File

@@ -40,7 +40,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
capture_mem_after = multiprocessing.Value("q", -1) # long long capture_mem_after = multiprocessing.Value("q", -1) # long long
def capture_model_wrapper(original_method): def capture_model_wrapper(original_method):
def wrapped(self): def wrapped(self):
mem_before = torch.npu.mem_get_info()[0] # free memory mem_before = torch.npu.mem_get_info()[0] # free memory
result = original_method(self) result = original_method(self)
@@ -55,19 +54,16 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
original_capture = NPUModelRunner.capture_model original_capture = NPUModelRunner.capture_model
with patch.object(NPUModelRunner, with patch.object(NPUModelRunner, "capture_model", new=capture_model_wrapper(original_capture)):
'capture_model',
new=capture_model_wrapper(original_capture)):
prompts = [ prompts = [
"Hello, my name is", "The president of the United States is", "Hello, my name is",
"The capital of France is", "The future of AI is" "The president of the United States is",
"The capital of France is",
"The future of AI is",
] ]
sampling_params = SamplingParams(max_tokens=max_tokens, sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
temperature=0.0)
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8": if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
vllm_model = VllmRunner(model, vllm_model = VllmRunner(model, max_model_len=1024, quantization="ascend")
max_model_len=1024,
quantization="ascend")
else: else:
vllm_model = VllmRunner(model) vllm_model = VllmRunner(model)
_ = vllm_model.generate(prompts, sampling_params) _ = vllm_model.generate(prompts, sampling_params)
@@ -94,5 +90,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
assert mem_used_by_capture < max_mem_expected, ( assert mem_used_by_capture < max_mem_expected, (
f"capture_model used more memory than expected. " f"capture_model used more memory than expected. "
f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, " f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
f"Expected: < {max_capture_mem_gib:.2f} GiB") f"Expected: < {max_capture_mem_gib:.2f} GiB"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn' )
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

View File

@@ -15,8 +15,7 @@ from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B" MODEL = "Qwen/Qwen3-0.6B"
MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16" MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16"
first_prompt = ("The following numbers of the sequence " + first_prompt = "The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:"
", ".join(str(i) for i in range(10)) + " are:")
example_prompts = [ example_prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
@@ -31,7 +30,9 @@ default_params = dict(
) )
def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ): def test_without_spec_decoding(
monkeypatch: pytest.MonkeyPatch,
):
"""Test consistency of combos of async scheduling, preemption, """Test consistency of combos of async scheduling, preemption,
uni/multiproc executor, prefill chunking.""" uni/multiproc executor, prefill chunking."""
test_sampling_params: list[dict[str, Any]] = [ test_sampling_params: list[dict[str, Any]] = [
@@ -85,11 +86,11 @@ def run_tests(
# avoid precision errors # avoid precision errors
outputs: list[tuple[str, list, list]] = [] outputs: list[tuple[str, list, list]] = []
for n, ( for n, (
test_preemption, test_preemption,
executor, executor,
async_scheduling, async_scheduling,
spec_config, spec_config,
test_prefill_chunking, test_prefill_chunking,
) in enumerate(test_configs, 1): ) in enumerate(test_configs, 1):
test_str = f"{n}/{len(test_configs)}" test_str = f"{n}/{len(test_configs)}"
test_results = run_test( test_results = run_test(
@@ -105,21 +106,18 @@ def run_tests(
outputs.append(test_results) outputs.append(test_results)
baseline_config, baseline_tests, _ = outputs[0] baseline_config, baseline_tests, _ = outputs[0]
_, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), (None, None, None))
(None, None, None))
print( print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}")
f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}"
)
failure = None failure = None
for test_config, test_outputs, test_acceptance_rates in outputs[1:]: for test_config, test_outputs, test_acceptance_rates in outputs[1:]:
for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip( for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip(
baseline_tests, baseline_tests,
baseline_acceptances or repeat(None), baseline_acceptances or repeat(None),
test_outputs, test_outputs,
test_acceptance_rates or repeat(None), test_acceptance_rates or repeat(None),
test_sampling_params, test_sampling_params,
): ):
try: try:
check_outputs_equal( check_outputs_equal(
@@ -129,21 +127,18 @@ def run_tests(
name_1=f"config=[{test_config}], params={params}", name_1=f"config=[{test_config}], params={params}",
) )
if (base_acceptance_rate is not None if base_acceptance_rate is not None and test_acceptance_rate is not None:
and test_acceptance_rate is not None):
if "spec_mml=None" in test_config: if "spec_mml=None" in test_config:
assert (test_acceptance_rate > base_acceptance_rate assert test_acceptance_rate > base_acceptance_rate or test_acceptance_rate == pytest.approx(
or test_acceptance_rate == pytest.approx( base_acceptance_rate, rel=5e-2
base_acceptance_rate, rel=5e-2)) )
else: else:
# Currently the reported acceptance rate is expected to be # Currently the reported acceptance rate is expected to be
# lower when we sometimes skip drafting altogether. # lower when we sometimes skip drafting altogether.
assert test_acceptance_rate > 0.1 assert test_acceptance_rate > 0.1
print(f"PASSED: config=[{test_config}], params={params}" print(f"PASSED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
f" accept_rate={test_acceptance_rate}")
except AssertionError as e: except AssertionError as e:
print(f"FAILED: config=[{test_config}], params={params}" print(f"FAILED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}")
f" accept_rate={test_acceptance_rate}")
if failure is None: if failure is None:
failure = e failure = e
@@ -161,33 +156,35 @@ def run_test(
spec_config: dict[str, Any] | None, spec_config: dict[str, Any] | None,
test_prefill_chunking: bool, test_prefill_chunking: bool,
): ):
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
spec_decoding = spec_config is not None spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = ( cache_arg: dict[str, Any] = (
# Force preemptions # Force preemptions
dict(num_gpu_blocks_override=2) if test_preemption else dict( dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9)
gpu_memory_utilization=0.9)) )
spec_mml = (spec_config or {}).get("max_model_len") spec_mml = (spec_config or {}).get("max_model_len")
test_config = (f"executor={executor}, preemption={test_preemption}, " test_config = (
f"async_sched={async_scheduling}, " f"executor={executor}, preemption={test_preemption}, "
f"chunk_prefill={test_prefill_chunking}, " f"async_sched={async_scheduling}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}") f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
)
print("-" * 80) print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}") print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80) print("-" * 80)
with VllmRunner( with VllmRunner(
model, model,
max_model_len=512, max_model_len=512,
enable_chunked_prefill=test_prefill_chunking, enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking # Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None, max_num_batched_tokens=48 if test_prefill_chunking else None,
enforce_eager=True, enforce_eager=True,
async_scheduling=async_scheduling, async_scheduling=async_scheduling,
distributed_executor_backend=executor, distributed_executor_backend=executor,
dtype="float16", # avoid precision errors dtype="float16", # avoid precision errors
speculative_config=spec_config, speculative_config=spec_config,
disable_log_stats=False, disable_log_stats=False,
**cache_arg, **cache_arg,
) as vllm_model: ) as vllm_model:
results = [] results = []
acceptance_rates: list[float] | None = [] if spec_decoding else None acceptance_rates: list[float] | None = [] if spec_decoding else None
@@ -197,26 +194,23 @@ def run_test(
results.append( results.append(
vllm_model.generate( vllm_model.generate(
example_prompts, example_prompts,
sampling_params=SamplingParams(**default_params, sampling_params=SamplingParams(**default_params, **override_params),
**override_params), )
)) )
metrics_after = vllm_model.model.get_metrics() metrics_after = vllm_model.model.get_metrics()
if acceptance_rates is not None: if acceptance_rates is not None:
acceptance_rate = _get_acceptance_rate(metrics_before, acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after)
metrics_after)
acceptance_rates.append(acceptance_rate) acceptance_rates.append(acceptance_rate)
print(f"ACCEPTANCE RATE {acceptance_rate}") print(f"ACCEPTANCE RATE {acceptance_rate}")
if test_preemption: if test_preemption:
preemptions = _get_count(metrics_before, metrics_after, preemptions = _get_count(metrics_before, metrics_after, "vllm:num_preemptions")
"vllm:num_preemptions")
assert preemptions > 0, "preemption test had no preemptions" assert preemptions > 0, "preemption test had no preemptions"
if len(results) > 1: if len(results) > 1:
# First check that the different parameter configs # First check that the different parameter configs
# actually result in different output. # actually result in different output.
for other_test_outs, params in zip(results[1:], for other_test_outs, params in zip(results[1:], sampling_param_tests[1:]):
sampling_param_tests[1:]):
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
check_outputs_equal( check_outputs_equal(
outputs_0_lst=results[0][0], outputs_0_lst=results[0][0],

View File

@@ -42,6 +42,7 @@ def new_kv_cache_spec(
attention_chunk_size=attention_chunk_size, attention_chunk_size=attention_chunk_size,
) )
def test_auto_fit_max_model_len(): def test_auto_fit_max_model_len():
"""Test that max_model_len=-1 auto-fits to available NPU memory.""" """Test that max_model_len=-1 auto-fits to available NPU memory."""
# Create config with original_max_model_len=-1 to trigger auto-fit # Create config with original_max_model_len=-1 to trigger auto-fit
@@ -59,9 +60,7 @@ def test_auto_fit_max_model_len():
# With enough memory, max_model_len stays at the derived max # With enough memory, max_model_len stays at the derived max
large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory
_kv_cache_configs = get_kv_cache_configs( _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory])
vllm_config, [kv_cache_specs], [large_available_memory]
)
assert vllm_config.model_config.max_model_len == 1024 assert vllm_config.model_config.max_model_len == 1024
# Reset for next test # Reset for next test
@@ -73,9 +72,7 @@ def test_auto_fit_max_model_len():
# Need memory for at least max_model_len tokens # Need memory for at least max_model_len tokens
# 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
limited_memory = mem_per_block_per_layer * 2 * 32 limited_memory = mem_per_block_per_layer * 2 * 32
_kv_cache_configs = get_kv_cache_configs( _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [limited_memory])
vllm_config, [kv_cache_specs], [limited_memory]
)
# Should be reduced to fit in memory # Should be reduced to fit in memory
assert vllm_config.model_config.max_model_len < 1024 assert vllm_config.model_config.max_model_len < 1024
assert vllm_config.model_config.max_model_len > 0 assert vllm_config.model_config.max_model_len > 0
@@ -94,7 +91,5 @@ def test_auto_fit_max_model_len_not_triggered():
} }
# This should work normally without auto-fit # This should work normally without auto-fit
_kv_cache_configs = get_kv_cache_configs( _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32])
vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
)
assert vllm_config.model_config.max_model_len == 16 assert vllm_config.model_config.max_model_len == 16

View File

@@ -70,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
if target_words > 50: if target_words > 50:
# For longer prompts, repeat context # For longer prompts, repeat context
padding_text = ( padding_text = " This is an interesting topic that deserves more explanation. " * (target_words // 50)
" This is an interesting topic that deserves more explanation. " *
(target_words // 50))
base_prompt = base_prompt + padding_text base_prompt = base_prompt + padding_text
return base_prompt return base_prompt
@@ -83,10 +81,7 @@ def _extract_step_logprobs(request_output):
inner = request_output.outputs[0] inner = request_output.outputs[0]
if hasattr(inner, "logprobs") and inner.logprobs is not None: if hasattr(inner, "logprobs") and inner.logprobs is not None:
t = torch.tensor( t = torch.tensor(
[ [inner.logprobs[i][tid].logprob for i, tid in enumerate(inner.token_ids)],
inner.logprobs[i][tid].logprob
for i, tid in enumerate(inner.token_ids)
],
dtype=torch.float32, dtype=torch.float32,
) )
return t, inner.token_ids return t, inner.token_ids
@@ -95,8 +90,7 @@ def _extract_step_logprobs(request_output):
@pytest.mark.timeout(1000) @pytest.mark.timeout(1000)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch):
monkeypatch: pytest.MonkeyPatch):
""" """
Ensures that the same request (the 'needle' prompt) yields identical output Ensures that the same request (the 'needle' prompt) yields identical output
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
@@ -184,8 +178,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
if i == needle_pos: if i == needle_pos:
prompts.append(needle_prompt) prompts.append(needle_prompt)
else: else:
prompts.append( prompts.append(_random_prompt(min_random_prompt, max_random_prompt))
_random_prompt(min_random_prompt, max_random_prompt))
# Generate with the larger-batch engine # Generate with the larger-batch engine
outputs = llm.generate(prompts, sampling) outputs = llm.generate(prompts, sampling)
@@ -196,27 +189,27 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
text = needle_output.outputs[0].text text = needle_output.outputs[0].text
if text != baseline_text: if text != baseline_text:
print( print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
mismatches += 1 mismatches += 1
passes = num_trials - mismatches passes = num_trials - mismatches
# Dump how many passed vs failed # Dump how many passed vs failed
print(f"[determinism] total={num_trials}, passed={passes}, " print(
f"failed={mismatches}, max_batch_size={max_batch_size}") f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}"
)
if mismatches > 0: if mismatches > 0:
pytest.fail( pytest.fail(
f"Nondeterministic outputs detected: {mismatches} failed out " f"Nondeterministic outputs detected: {mismatches} failed out "
f"of {num_trials} trials (max_batch_size={max_batch_size}).") f"of {num_trials} trials (max_batch_size={max_batch_size})."
)
finally: finally:
del llm del llm
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch):
monkeypatch: pytest.MonkeyPatch):
seed = int(os.getenv("VLLM_TEST_SEED", "12345")) seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed) random.seed(seed)
model_name = DEFAULT_MODEL model_name = DEFAULT_MODEL
@@ -230,9 +223,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
if disable_custom_ar: if disable_custom_ar:
print(f"\n{'=' * 80}") print(f"\n{'=' * 80}")
print( print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})"
)
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
llm = LLM( llm = LLM(
@@ -266,15 +257,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
bs1_logprobs_per_prompt = [] bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = [] bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts): for idx, p in enumerate(prompts):
print( print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
outs = llm.generate([p], sp, use_tqdm=False) outs = llm.generate([p], sp, use_tqdm=False)
assert len(outs) == 1 assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0]) step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs) bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids) bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -296,108 +284,92 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o) step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs) bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids) bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
failed_prompts = [] failed_prompts = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip( zip(
bs1_logprobs_per_prompt, bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt, bsN_logprobs_per_prompt,
bs1_tokens_per_prompt, bs1_tokens_per_prompt,
bsN_tokens_per_prompt, bsN_tokens_per_prompt,
)): )
):
if len(logprobs_bs1) != len(logprobs_bsN): if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
f"vs {len(logprobs_bsN)} (BS=N)") failed_prompts.append(
failed_prompts.append({ {
"prompt_idx": i, "prompt_idx": i,
"step": "all", "step": "all",
"reason": reason, "reason": reason,
"prompt_preview": prompts[i][:100], "prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1, "bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN, "bsN_tokens": tokens_bsN,
}) }
)
continue continue
# Check if tokens match first # Check if tokens match first
if tokens_bs1 != tokens_bsN: if tokens_bs1 != tokens_bsN:
failed_prompts.append({ failed_prompts.append(
"prompt_idx": {
i, "prompt_idx": i,
"step": "step": "sampling",
"sampling", "reason": "Different tokens sampled",
"reason": "prompt_preview": prompts[i][:100],
"Different tokens sampled", "bs1_tokens": tokens_bs1,
"prompt_preview": "bsN_tokens": tokens_bsN,
prompts[i][:100], "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bs1_tokens": "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
tokens_bs1, }
"bsN_tokens": )
tokens_bsN,
"bs1_all_logprobs":
[logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bsN_all_logprobs":
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
})
continue continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape: if a.shape != b.shape:
failed_prompts.append({ failed_prompts.append(
"prompt_idx": i, {
"step": t, "prompt_idx": i,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}", "step": t,
"prompt_preview": prompts[i][:100], "reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
break break
if not torch.equal(a, b): if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item() max_diff = torch.abs(a - b).max().item()
# Print which token failed # Print which token failed
print( print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}"
)
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}") print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}") print(f" BS=N logprob: {b.tolist()}")
failed_prompts.append({ failed_prompts.append(
"prompt_idx": {
i, "prompt_idx": i,
"step": "step": t,
t, "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"reason": "prompt_preview": prompts[i][:100],
f"Bitwise mismatch (max_diff={max_diff:.6e})", "bs1_tokens": tokens_bs1,
"prompt_preview": "bsN_tokens": tokens_bsN,
prompts[i][:100], "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))],
"bs1_tokens": "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))],
tokens_bs1, }
"bsN_tokens": )
tokens_bsN,
"bs1_all_logprobs": [
logprobs_bs1[s].tolist()
for s in range(len(logprobs_bs1))
],
"bsN_all_logprobs": [
logprobs_bsN[s].tolist()
for s in range(len(logprobs_bsN))
],
})
break break
del llm del llm
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
# Print summary of all failures # Print summary of all failures
if failed_prompts: if failed_prompts:
print(f"\n{'=' * 80}") print(f"\n{'=' * 80}")
fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/" fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed"
f"{len(prompts)} prompts failed")
print(fail_msg) print(fail_msg)
print(f"{'=' * 80}") print(f"{'=' * 80}")
for fail in failed_prompts: for fail in failed_prompts:
@@ -412,21 +384,18 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
print(f" BS=N tokens: {fail['bsN_tokens']}") print(f" BS=N tokens: {fail['bsN_tokens']}")
if "bs1_all_logprobs" in fail: if "bs1_all_logprobs" in fail:
print( print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:")
f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:"
)
for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]): for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}") print(f" Step {step_idx}: {logprobs}")
print( print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:")
f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:"
)
for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]): for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]):
print(f" Step {step_idx}: {logprobs}") print(f" Step {step_idx}: {logprobs}")
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
# Fail the test with summary # Fail the test with summary
msg = (f"Batch invariance violated in {len(failed_prompts)}/" msg = (
f"{len(prompts)} prompts. See output above for details.") f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details."
)
pytest.fail(msg) pytest.fail(msg)
@@ -476,8 +445,7 @@ def test_simple_generation(monkeypatch: pytest.MonkeyPatch):
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def test_logprobs_without_batch_invariance_should_fail( def test_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch):
monkeypatch: pytest.MonkeyPatch):
""" """
This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN.
It DISABLES batch invariance mode and expects to see non-deterministic behavior It DISABLES batch invariance mode and expects to see non-deterministic behavior
@@ -540,15 +508,12 @@ def test_logprobs_without_batch_invariance_should_fail(
bs1_logprobs_per_prompt = [] bs1_logprobs_per_prompt = []
bs1_tokens_per_prompt = [] bs1_tokens_per_prompt = []
for idx, p in enumerate(prompts): for idx, p in enumerate(prompts):
print( print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...")
f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..."
)
outs = llm.generate([p], sp, use_tqdm=False) outs = llm.generate([p], sp, use_tqdm=False)
assert len(outs) == 1 assert len(outs) == 1
step_logprobs, token_ids = _extract_step_logprobs(outs[0]) step_logprobs, token_ids = _extract_step_logprobs(outs[0])
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bs1_logprobs_per_prompt.append(step_logprobs) bs1_logprobs_per_prompt.append(step_logprobs)
bs1_tokens_per_prompt.append(token_ids) bs1_tokens_per_prompt.append(token_ids)
print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}")
@@ -569,74 +534,80 @@ def test_logprobs_without_batch_invariance_should_fail(
print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}")
step_logprobs, token_ids = _extract_step_logprobs(o) step_logprobs, token_ids = _extract_step_logprobs(o)
if step_logprobs is None: if step_logprobs is None:
pytest.skip("Logits are not available on RequestOutput; " pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.")
"enable logprobs return to run this test.")
bsN_logprobs_per_prompt.append(step_logprobs) bsN_logprobs_per_prompt.append(step_logprobs)
bsN_tokens_per_prompt.append(token_ids) bsN_tokens_per_prompt.append(token_ids)
# Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
differences_found = [] differences_found = []
for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate(
zip( zip(
bs1_logprobs_per_prompt, bs1_logprobs_per_prompt,
bsN_logprobs_per_prompt, bsN_logprobs_per_prompt,
bs1_tokens_per_prompt, bs1_tokens_per_prompt,
bsN_tokens_per_prompt, bsN_tokens_per_prompt,
)): )
):
if len(logprobs_bs1) != len(logprobs_bsN): if len(logprobs_bs1) != len(logprobs_bsN):
reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)"
f"vs {len(logprobs_bsN)} (BS=N)") differences_found.append(
differences_found.append({ {
"prompt_idx": i, "prompt_idx": i,
"step": "all", "step": "all",
"reason": reason, "reason": reason,
"prompt_preview": prompts[i][:100], "prompt_preview": prompts[i][:100],
"bs1_tokens": tokens_bs1, "bs1_tokens": tokens_bs1,
"bsN_tokens": tokens_bsN, "bsN_tokens": tokens_bsN,
}) }
)
continue continue
# Check if tokens match first # Check if tokens match first
if tokens_bs1 != tokens_bsN: if tokens_bs1 != tokens_bsN:
differences_found.append({ differences_found.append(
"prompt_idx": i, {
"step": "sampling", "prompt_idx": i,
"reason": "Different tokens sampled", "step": "sampling",
"prompt_preview": prompts[i][:100], "reason": "Different tokens sampled",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
continue continue
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)):
if a.shape != b.shape: if a.shape != b.shape:
differences_found.append({ differences_found.append(
"prompt_idx": i, {
"step": t, "prompt_idx": i,
"reason": f"Shape mismatch: {a.shape} vs {b.shape}", "step": t,
"prompt_preview": prompts[i][:100], "reason": f"Shape mismatch: {a.shape} vs {b.shape}",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
break break
if not torch.equal(a, b): if not torch.equal(a, b):
max_diff = torch.abs(a - b).max().item() max_diff = torch.abs(a - b).max().item()
print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}")
f"Token {t}: max_diff={max_diff:.6e}")
bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A"
bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A"
print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}")
print(f" BS=1 logprob: {a.tolist()}") print(f" BS=1 logprob: {a.tolist()}")
print(f" BS=N logprob: {b.tolist()}") print(f" BS=N logprob: {b.tolist()}")
differences_found.append({ differences_found.append(
"prompt_idx": i, {
"step": t, "prompt_idx": i,
"reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", "step": t,
"prompt_preview": prompts[i][:100], "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})",
"bs1_tokens": tokens_bs1, "prompt_preview": prompts[i][:100],
"bsN_tokens": tokens_bsN, "bs1_tokens": tokens_bs1,
}) "bsN_tokens": tokens_bsN,
}
)
break break
del llm del llm
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
@@ -646,7 +617,8 @@ def test_logprobs_without_batch_invariance_should_fail(
success_msg = ( success_msg = (
f"✓ SUCCESS: Batch invariance is doing something! " f"✓ SUCCESS: Batch invariance is doing something! "
f"Found {len(differences_found)}/{len(prompts)} prompts " f"Found {len(differences_found)}/{len(prompts)} prompts "
f"with differences when batch invariance was DISABLED.") f"with differences when batch invariance was DISABLED."
)
print(success_msg) print(success_msg)
print(f"{'=' * 80}") print(f"{'=' * 80}")
for diff in differences_found: for diff in differences_found:
@@ -666,7 +638,8 @@ def test_logprobs_without_batch_invariance_should_fail(
f"✗ UNEXPECTED: All {len(prompts)} prompts matched " f"✗ UNEXPECTED: All {len(prompts)} prompts matched "
f"between BS=1 and BS=N even with batch invariance DISABLED. " f"between BS=1 and BS=N even with batch invariance DISABLED. "
f"This suggests batch invariance might not be necessary, " f"This suggests batch invariance might not be necessary, "
f"or the test needs more sensitive prompts.") f"or the test needs more sensitive prompts."
)
print(fail_msg) print(fail_msg)
print(f"{'=' * 80}\n") print(f"{'=' * 80}\n")
pytest.fail(fail_msg) pytest.fail(fail_msg)

View File

@@ -37,10 +37,7 @@ def test_end_to_end():
prompt = "How are you?" prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
with VllmRunner("Qwen/Qwen3-0.6B", with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True, cudagraph_capture_sizes=[1, 2, 4, 8]) as runner:
enable_sleep_mode=True,
cudagraph_capture_sizes=[1, 2, 4, 8]) as runner:
output = runner.model.generate(prompt, sampling_params) output = runner.model.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only # which is difficult to measure in the test. therefore, we only

View File

@@ -30,9 +30,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]
def get_prompt_embeds(chat, tokenizer, embedding_layer): def get_prompt_embeds(chat, tokenizer, embedding_layer):
"""Convert chat messages to prompt embeddings.""" """Convert chat messages to prompt embeddings."""
token_ids = tokenizer.apply_chat_template(chat, token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
add_generation_prompt=True,
return_tensors='pt')
prompt_embeds = embedding_layer(token_ids).squeeze(0) prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds return prompt_embeds
@@ -53,15 +51,16 @@ def test_mixed_prompt_embeds_and_text(model_name):
# Run inference with mixed inputs # Run inference with mixed inputs
with VllmRunner( with VllmRunner(
model_name, model_name,
enable_prompt_embeds=True, enable_prompt_embeds=True,
cudagraph_capture_sizes=[1, 2, 4, 8], cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_runner: ) as vllm_runner:
# Test prompt embeddings # Test prompt embeddings
embeds_output = vllm_runner.model.generate({ embeds_output = vllm_runner.model.generate(
"prompt_embeds": {
prompt_embeds, "prompt_embeds": prompt_embeds,
}) }
)
# Test text prompt # Test text prompt
text_output = vllm_runner.model.generate(text_prompt) text_output = vllm_runner.model.generate(text_prompt)

View File

@@ -107,15 +107,13 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):
def _accuracy_test(llm: LLM, subscriber: MockSubscriber): def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
cpu_block_size = (llm.llm_engine.vllm_config.kv_transfer_config. cpu_block_size = llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config["block_size"]
kv_connector_extra_config["block_size"])
subscriber.get_new_cpu_stored_events() subscriber.get_new_cpu_stored_events()
# prepend prompt to be cpu block aligned # prepend prompt to be cpu block aligned
prompt = "Let's count to 10. One, two, three, four," prompt = "Let's count to 10. One, two, three, four,"
while (len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % while len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size != 0:
cpu_block_size != 0):
prompt = ". " + prompt prompt = ". " + prompt
assert subscriber.get_new_cpu_stored_events() assert subscriber.get_new_cpu_stored_events()
@@ -123,8 +121,7 @@ def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
test_count = 100 test_count = 100
success_count = 0 success_count = 0
for i in range(test_count): for i in range(test_count):
if (llm.generate(prompt, sampling_params, if llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text == " five":
use_tqdm=False)[0].outputs[0].text == " five"):
success_count += 1 success_count += 1
assert success_count >= 0.5 * test_count assert success_count >= 0.5 * test_count
@@ -143,7 +140,7 @@ def test_cpu_offloading() -> None:
"num_cpu_blocks": 1000, "num_cpu_blocks": 1000,
"block_size": 128, "block_size": 128,
"spec_name": "NPUOffloadingSpec", "spec_name": "NPUOffloadingSpec",
"spec_module_path": "vllm_ascend.kv_offload.npu" "spec_module_path": "vllm_ascend.kv_offload.npu",
}, },
) )

View File

@@ -17,7 +17,7 @@
# limitations under the License. # limitations under the License.
# #
import json import json
from typing import Any, Dict from typing import Any
import jsonschema import jsonschema
import pytest import pytest
@@ -34,8 +34,10 @@ GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def sample_regex(): def sample_regex():
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" return (
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
@@ -43,66 +45,41 @@ def sample_json_schema():
return { return {
"type": "object", "type": "object",
"properties": { "properties": {
"name": { "name": {"type": "string"},
"type": "string" "age": {"type": "integer"},
}, "skills": {"type": "array", "items": {"type": "string", "maxLength": 10}, "minItems": 3},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work_history": { "work_history": {
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "type": "object",
"properties": { "properties": {
"company": { "company": {"type": "string"},
"type": "string" "duration": {"type": "number"},
}, "position": {"type": "string"},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
}, },
"required": ["company", "position"] "required": ["company", "position"],
} },
} },
}, },
"required": ["name", "age", "skills", "work_history"] "required": ["name", "age", "skills", "work_history"],
} }
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str, def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema):
sample_json_schema): runner_kwargs: dict[str, Any] = {}
runner_kwargs: Dict[str, Any] = {}
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=1.0, temperature=1.0, max_tokens=500, structured_outputs=StructuredOutputsParams(json=sample_json_schema)
max_tokens=500, )
structured_outputs=StructuredOutputsParams(json=sample_json_schema))
runner_kwargs = { runner_kwargs = {
"cudagraph_capture_sizes": [1, 2, 4, 8], "cudagraph_capture_sizes": [1, 2, 4, 8],
"seed": 0, "seed": 0,
"structured_outputs_config": { "structured_outputs_config": {"backend": guided_decoding_backend},
"backend": guided_decoding_backend
},
} }
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [ prompts = [f"Give an example JSON for an employee profile that fits this schema: {sample_json_schema}"] * 2
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2
inputs = vllm_model.get_inputs(prompts) inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs, outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params)
sampling_params=sampling_params)
assert outputs is not None assert outputs is not None
@@ -115,34 +92,27 @@ def test_guided_json_completion(guided_decoding_backend: str,
assert generated_text is not None assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text) output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, jsonschema.validate(instance=output_json, schema=sample_json_schema)
schema=sample_json_schema)
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_regex(guided_decoding_backend: str, sample_regex): def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines": if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.") pytest.skip("Outlines doesn't support regex-based guided decoding.")
runner_kwargs: Dict[str, Any] = {} runner_kwargs: dict[str, Any] = {}
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.8, temperature=0.8, top_p=0.95, structured_outputs=StructuredOutputsParams(regex=sample_regex)
top_p=0.95, )
structured_outputs=StructuredOutputsParams(regex=sample_regex))
runner_kwargs = { runner_kwargs = {
"cudagraph_capture_sizes": [1, 2, 4, 8], "cudagraph_capture_sizes": [1, 2, 4, 8],
"seed": 0, "seed": 0,
"structured_outputs_config": { "structured_outputs_config": {"backend": guided_decoding_backend},
"backend": guided_decoding_backend
},
} }
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [ prompts = [f"Give an example IPv4 address with this regex: {sample_regex}"] * 2
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2
inputs = vllm_model.get_inputs(prompts) inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs, outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params)
sampling_params=sampling_params)
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
assert output is not None assert output is not None

View File

@@ -19,20 +19,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
query= query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
), ),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
query= query="What are all distinct countries where singers above age 20 are from?" # noqa: E501
"What are all distinct countries where singers above age 20 are from?" # noqa: E501
), ),
] ]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate( outputs = llm.generate(
prompts, prompts, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
sampling_params, )
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs. # Print the outputs.
generated_texts: list[str] = [] generated_texts: list[str] = []
for output in outputs: for output in outputs:
@@ -45,16 +41,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
def test_ilama_lora(ilama_lora_files): def test_ilama_lora(ilama_lora_files):
with VllmRunner( with VllmRunner(
MODEL_PATH, MODEL_PATH,
enable_lora=True, enable_lora=True,
dtype="half", dtype="half",
max_loras=4, max_loras=4,
max_model_len=1024, max_model_len=1024,
cudagraph_capture_sizes=[1, 2, 4, 8], cudagraph_capture_sizes=[1, 2, 4, 8],
max_num_seqs=16, max_num_seqs=16,
enforce_eager=True, enforce_eager=True,
) as vllm_model: ) as vllm_model:
output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1) output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)): for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i] assert output1[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -1,12 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest from unittest.mock import patch
import pytest
import vllm import vllm
import vllm.config import vllm.config
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from unittest.mock import patch
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op from vllm_ascend.utils import enable_custom_op
@@ -53,17 +53,12 @@ def do_sample(
PROMPT_TEMPLATE.format(context="How many candidates are there?"), PROMPT_TEMPLATE.format(context="How many candidates are there?"),
PROMPT_TEMPLATE.format(context="Count the number of candidates."), PROMPT_TEMPLATE.format(context="Count the number of candidates."),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
context= context="Which poll resource provided the most number of candidate information?" # noqa: E501
"Which poll resource provided the most number of candidate information?" # noqa: E501
), ),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(context="Return the poll resource associated with the most candidates."),
context=
"Return the poll resource associated with the most candidates."),
] ]
sampling_params = vllm.SamplingParams(temperature=0, sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64, stop=["<|im_end|>"])
max_tokens=64,
stop=["<|im_end|>"])
if tensorizer_config_dict is not None: if tensorizer_config_dict is not None:
outputs = llm.generate( outputs = llm.generate(
prompts, prompts,
@@ -73,14 +68,15 @@ def do_sample(
lora_id, lora_id,
lora_path, lora_path,
tensorizer_config_dict=tensorizer_config_dict, tensorizer_config_dict=tensorizer_config_dict,
) if lora_id else None, )
if lora_id
else None,
) )
else: else:
outputs = llm.generate( outputs = llm.generate(
prompts, prompts,
sampling_params, sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
if lora_id else None,
) )
generated_texts: list[str] = [] generated_texts: list[str] = []
@@ -92,33 +88,40 @@ def do_sample(
return generated_texts return generated_texts
def generate_and_test(llm, def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | None = None):
llama32_lora_files,
tensorizer_config_dict: dict | None = None):
print("lora adapter created") print("lora adapter created")
print("lora 1") print("lora 1")
assert (do_sample( assert (
llm, do_sample(
llama32_lora_files, llm,
tensorizer_config_dict=tensorizer_config_dict, llama32_lora_files,
lora_id=1, tensorizer_config_dict=tensorizer_config_dict,
) == EXPECTED_LORA_OUTPUT) lora_id=1,
)
== EXPECTED_LORA_OUTPUT
)
print("lora 2") print("lora 2")
assert (do_sample( assert (
llm, do_sample(
llama32_lora_files, llm,
tensorizer_config_dict=tensorizer_config_dict, llama32_lora_files,
lora_id=2, tensorizer_config_dict=tensorizer_config_dict,
) == EXPECTED_LORA_OUTPUT) lora_id=2,
)
== EXPECTED_LORA_OUTPUT
)
print("base model") print("base model")
assert (do_sample( assert (
llm, do_sample(
llama32_lora_files, llm,
tensorizer_config_dict=tensorizer_config_dict, llama32_lora_files,
lora_id=0, tensorizer_config_dict=tensorizer_config_dict,
) == EXPECTED_BASE_MODEL_OUTPUT) lora_id=0,
)
== EXPECTED_BASE_MODEL_OUTPUT
)
print("removing lora") print("removing lora")

View File

@@ -45,9 +45,7 @@ def test_minicpm(model) -> None:
] ]
max_tokens = 5 max_tokens = 5
with VllmRunner(model, with VllmRunner(model, max_model_len=512, gpu_memory_utilization=0.7) as runner:
max_model_len=512,
gpu_memory_utilization=0.7) as runner:
runner.generate_greedy(example_prompts, max_tokens) runner.generate_greedy(example_prompts, max_tokens)
@@ -56,19 +54,12 @@ def test_whisper(model) -> None:
prompts = ["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"] prompts = ["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"]
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate] audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None)
max_tokens=10,
stop_token_ids=None)
with VllmRunner(model, with VllmRunner(
max_model_len=448, model, max_model_len=448, max_num_seqs=5, dtype="bfloat16", block_size=128, gpu_memory_utilization=0.9
max_num_seqs=5, ) as runner:
dtype="bfloat16", outputs = runner.generate(prompts=prompts, audios=audios, sampling_params=sampling_params)
block_size=128,
gpu_memory_utilization=0.9) as runner:
outputs = runner.generate(prompts=prompts,
audios=audios,
sampling_params=sampling_params)
assert outputs is not None, "Generated outputs should not be None." assert outputs is not None, "Generated outputs should not be None."
assert len(outputs) > 0, "Generated outputs should not be empty." assert len(outputs) > 0, "Generated outputs should not be empty."

View File

@@ -39,59 +39,56 @@ def test_models_with_multistream_overlap_shared_expert(
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
prompts = [ prompts = [
"Hello, my name is", "The president of the United States is", "Hello, my name is",
"The capital of France is", "The future of AI is" "The president of the United States is",
"The capital of France is",
"The future of AI is",
] ]
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner( with VllmRunner(
model, model,
max_model_len=1024, max_model_len=1024,
enforce_eager=True, enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32], cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={ additional_config={
"multistream_overlap_shared_expert": True, "multistream_overlap_shared_expert": True,
}, },
quantization="ascend", quantization="ascend",
) as runner: ) as runner:
vllm_moe_ms_eager_outputs = runner.model.generate( vllm_moe_ms_eager_outputs = runner.model.generate(prompts, sampling_params)
prompts, sampling_params)
with VllmRunner( with VllmRunner(
model, model,
max_model_len=1024, max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 16, 32], cudagraph_capture_sizes=[4, 8, 16, 32],
additional_config={ additional_config={
"multistream_overlap_shared_expert": True, "multistream_overlap_shared_expert": True,
}, },
quantization="ascend", quantization="ascend",
) as runner: ) as runner:
vllm_moe_ms_aclgraph_outputs = runner.model.generate( vllm_moe_ms_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
prompts, sampling_params)
with VllmRunner( with VllmRunner(
model, model,
max_model_len=1024, max_model_len=1024,
enforce_eager=True, enforce_eager=True,
cudagraph_capture_sizes=[4, 8, 16, 32], cudagraph_capture_sizes=[4, 8, 16, 32],
quantization="ascend", quantization="ascend",
) as runner: ) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params) vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_moe_ms_eager_outputs_list = [] vllm_moe_ms_eager_outputs_list = []
for output in vllm_moe_ms_eager_outputs: for output in vllm_moe_ms_eager_outputs:
vllm_moe_ms_eager_outputs_list.append( vllm_moe_ms_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
(output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_aclgraph_outputs_list = [] vllm_moe_ms_aclgraph_outputs_list = []
for output in vllm_moe_ms_aclgraph_outputs: for output in vllm_moe_ms_aclgraph_outputs:
vllm_moe_ms_aclgraph_outputs_list.append( vllm_moe_ms_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = [] vllm_eager_outputs_list = []
for output in vllm_eager_outputs: for output in vllm_eager_outputs:
vllm_eager_outputs_list.append( vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal( check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list, outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -19,6 +19,7 @@ from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal from tests.e2e.model_utils import check_outputs_equal
# fmt: off
def test_qwen3_w8a8_quant(): def test_qwen3_w8a8_quant():
max_tokens = 5 max_tokens = 5
example_prompts = [ example_prompts = [
@@ -29,6 +30,7 @@ def test_qwen3_w8a8_quant():
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be' ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)] )]
# fmt: on
with VllmRunner( with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A8", "vllm-ascend/Qwen3-0.6B-W8A8",
@@ -47,7 +49,7 @@ def test_qwen3_w8a8_quant():
name_1="vllm_quant_w8a8_outputs", name_1="vllm_quant_w8a8_outputs",
) )
# fmt: off
def test_qwen3_dense_w8a16(): def test_qwen3_dense_w8a16():
max_tokens = 5 max_tokens = 5
example_prompts = [ example_prompts = [
@@ -58,6 +60,7 @@ def test_qwen3_dense_w8a16():
13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387
], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be' ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be'
)] )]
# fmt: on
with VllmRunner( with VllmRunner(
"vllm-ascend/Qwen3-0.6B-W8A16", "vllm-ascend/Qwen3-0.6B-W8A16",

View File

@@ -1,8 +1,9 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from unittest.mock import patch
from tests.e2e.conftest import VllmRunner from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op from vllm_ascend.utils import enable_custom_op
@@ -27,16 +28,11 @@ LORA_TEST_EXPECTED = [
def format_chatml_messages(prompt: str): def format_chatml_messages(prompt: str):
return [ return [
{ {"role": "system", "content": "You are a helpful assistant."},
"role": "system", {"role": "user", "content": prompt},
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": prompt
},
] ]
@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"}) @patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
def test_multi_loras_with_tp_sync(): def test_multi_loras_with_tp_sync():
lora_name_id_map = {} lora_name_id_map = {}
@@ -102,9 +98,7 @@ def test_multi_loras_with_tp_sync():
outputs = llm.chat( outputs = llm.chat(
[messages], [messages],
sampling_params, sampling_params,
chat_template_kwargs={ chat_template_kwargs={"enable_thinking": False}, # for those loras, ensure enable_thinking=False
"enable_thinking": False
}, # for those loras, ensure enable_thinking=False
lora_request=lora_request, lora_request=lora_request,
use_tqdm=False, use_tqdm=False,
) )
@@ -117,11 +111,9 @@ def test_multi_loras_with_tp_sync():
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
for dynamic lora loading and unloading for dynamic lora loading and unloading
""" """
remove_lora_response = llm.llm_engine.remove_lora( remove_lora_response = llm.llm_engine.remove_lora(lora_id=lora_name_id_map[name])
lora_id=lora_name_id_map[name])
add_lora_response = llm.llm_engine.add_lora( add_lora_response = llm.llm_engine.add_lora(make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
print(f"{remove_lora_response=}, {add_lora_response=}") print(f"{remove_lora_response=}, {add_lora_response=}")
@@ -131,7 +123,6 @@ def test_multi_loras_with_tp_sync():
assert outputs == expected assert outputs == expected
for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED): for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
output_text = call_llm_get_outputs(prompt, "Alice") output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output, prompt) check_outputs(output_text, expected_output, prompt)

View File

@@ -25,15 +25,11 @@ def test_qwen3_topk() -> None:
example_prompts = [ example_prompts = [
"Hello, my name is", "Hello, my name is",
] ]
sampling_params = SamplingParams(max_tokens=5, sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner("Qwen/Qwen3-0.6B", with VllmRunner(
max_model_len=8192, "Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7
cudagraph_capture_sizes=[1, 2, 4, 8], ) as runner:
gpu_memory_utilization=0.7) as runner:
runner.generate(example_prompts, sampling_params) runner.generate(example_prompts, sampling_params)
@@ -42,29 +38,25 @@ def test_qwen3_prompt_logprobs() -> None:
"Hello, my name is", "Hello, my name is",
] ]
with VllmRunner("Qwen/Qwen3-0.6B", with VllmRunner(
max_model_len=8192, "Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7
cudagraph_capture_sizes=[1, 2, 4, 8], ) as runner:
gpu_memory_utilization=0.7) as runner: runner.generate_greedy_logprobs(example_prompts, max_tokens=5, num_logprobs=1)
runner.generate_greedy_logprobs(example_prompts,
max_tokens=5,
num_logprobs=1)
def test_qwen3_exponential_overlap() -> None: def test_qwen3_exponential_overlap() -> None:
example_prompts = [ example_prompts = [
"Hello, my name is", "Hello, my name is",
] ]
sampling_params = SamplingParams(max_tokens=5, sampling_params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9)
temperature=1.0,
top_k=50,
top_p=0.9)
with VllmRunner("Qwen/Qwen3-0.6B", with VllmRunner(
max_model_len=8192, "Qwen/Qwen3-0.6B",
cudagraph_capture_sizes=[1, 2, 4, 8], max_model_len=8192,
gpu_memory_utilization=0.7, cudagraph_capture_sizes=[1, 2, 4, 8],
additional_config={ gpu_memory_utilization=0.7,
"enable_async_exponential": True, additional_config={
}) as runner: "enable_async_exponential": True,
},
) as runner:
runner.generate(example_prompts, sampling_params) runner.generate(example_prompts, sampling_params)

View File

@@ -20,6 +20,7 @@
Run `pytest tests/test_offline_inference.py`. Run `pytest tests/test_offline_inference.py`.
""" """
import os import os
from unittest.mock import patch from unittest.mock import patch
@@ -44,11 +45,13 @@ def test_multimodal_vl(vl_config):
images = [image] * len(img_questions) images = [image] * len(img_questions)
prompts = vl_config["prompt_fn"](img_questions) prompts = vl_config["prompt_fn"](img_questions)
with VllmRunner(vl_config["model"], with VllmRunner(
mm_processor_kwargs=vl_config["mm_processor_kwargs"], vl_config["model"],
max_model_len=8192, mm_processor_kwargs=vl_config["mm_processor_kwargs"],
cudagraph_capture_sizes=[1, 2, 4, 8], max_model_len=8192,
limit_mm_per_prompt={"image": 1}) as vllm_model: cudagraph_capture_sizes=[1, 2, 4, 8],
limit_mm_per_prompt={"image": 1},
) as vllm_model:
outputs = vllm_model.generate_greedy( outputs = vllm_model.generate_greedy(
prompts=prompts, prompts=prompts,
images=images, images=images,
@@ -63,35 +66,30 @@ def test_multimodal_vl(vl_config):
@patch.dict(os.environ, {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"}) @patch.dict(os.environ, {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"})
def test_multimodal_audio(): def test_multimodal_audio():
audio_prompt = "".join([ audio_prompt = "".join([f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(2)])
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
for idx in range(2)
])
question = "What sport and what nursery rhyme are referenced?" question = "What sport and what nursery rhyme are referenced?"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" prompt = (
"<|im_start|>user\n" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"{audio_prompt}{question}<|im_end|>\n" "<|im_start|>user\n"
"<|im_start|>assistant\n") f"{audio_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
mm_data = { mm_data = {
"audio": [ "audio": [asset.audio_and_sample_rate for asset in [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]]
asset.audio_and_sample_rate for asset in
[AudioAsset("mary_had_lamb"),
AudioAsset("winning_call")]
]
} }
inputs = {"prompt": prompt, "multi_modal_data": mm_data} inputs = {"prompt": prompt, "multi_modal_data": mm_data}
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None)
max_tokens=10,
stop_token_ids=None)
with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct", with VllmRunner(
max_model_len=4096, "Qwen/Qwen2-Audio-7B-Instruct",
max_num_seqs=5, max_model_len=4096,
dtype="bfloat16", max_num_seqs=5,
limit_mm_per_prompt={"audio": 2}, dtype="bfloat16",
cudagraph_capture_sizes=[1, 2, 4, 8], limit_mm_per_prompt={"audio": 2},
gpu_memory_utilization=0.9) as runner: cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.9,
) as runner:
outputs = runner.generate(inputs, sampling_params=sampling_params) outputs = runner.generate(inputs, sampling_params=sampling_params)
assert outputs is not None, "Generated outputs should not be None." assert outputs is not None, "Generated outputs should not be None."

View File

@@ -20,13 +20,14 @@ Compare the outputs of vLLM with and without xlite.
Run `pytest tests/e2e/singlecard/test_xlite.py`. Run `pytest tests/e2e/singlecard/test_xlite.py`.
""" """
# ruff: noqa: E501
import os import os
import pytest import pytest
from vllm import SamplingParams from vllm import SamplingParams
from tests.e2e.singlecard.utils import (PROMPTS_SHORT, LLMTestCase, from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid
gen_and_valid)
os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2" os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"
@@ -35,9 +36,9 @@ CASE_DECODE_ONLY = LLMTestCase(
prompts=PROMPTS_SHORT, prompts=PROMPTS_SHORT,
golden_answers=[ golden_answers=[
"Hello, my name is Lina. I'm a 22-year-old student from China.", "Hello, my name is Lina. I'm a 22-year-old student from China.",
'The president of the United States is the same as the president of the United Nations. This is because the president', "The president of the United States is the same as the president of the United Nations. This is because the president",
'The capital of France is Paris. The capital of France is also the capital of the French Republic.', "The capital of France is Paris. The capital of France is also the capital of the French Republic.",
'The future of AI is not just a technological challenge but a profound transformation of how we live, work' "The future of AI is not just a technological challenge but a profound transformation of how we live, work",
], ],
sampling_params=SamplingParams( sampling_params=SamplingParams(
max_tokens=15, max_tokens=15,
@@ -45,19 +46,22 @@ CASE_DECODE_ONLY = LLMTestCase(
top_p=1.0, top_p=1.0,
top_k=0, top_k=0,
n=1, n=1,
)) ),
)
CASE_FULL = LLMTestCase( CASE_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
prompts=[ prompts=[
"Hello, my name is", "The president of the United States is", "Hello, my name is",
"The capital of France is", "The future of AI is" "The president of the United States is",
"The capital of France is",
"The future of AI is",
], ],
golden_answers=[ golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
' Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital', " Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and" " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
], ],
sampling_params=SamplingParams( sampling_params=SamplingParams(
max_tokens=32, max_tokens=32,
@@ -65,27 +69,25 @@ CASE_FULL = LLMTestCase(
top_p=1.0, top_p=1.0,
top_k=0, top_k=0,
n=1, n=1,
)) ),
)
@pytest.mark.skip( @pytest.mark.skip(reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
reason="TODO: Re-enable xlite_decode_only e2e test when stable.")
@pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY]) @pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY])
def test_models_with_xlite_decode_only(cur_case: LLMTestCase): def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
runner_kwargs = { runner_kwargs = {
"model_name": cur_case.model, "model_name": cur_case.model,
"max_model_len": 1024, "max_model_len": 1024,
"block_size": 128, "block_size": 128,
"additional_config": { "additional_config": {"xlite_graph_config": {"enabled": True}},
"xlite_graph_config": {
"enabled": True
}
},
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
@pytest.mark.parametrize("cur_case", [CASE_FULL]) @pytest.mark.parametrize("cur_case", [CASE_FULL])
@@ -94,14 +96,11 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
"model_name": cur_case.model, "model_name": cur_case.model,
"max_model_len": 1024, "max_model_len": 1024,
"block_size": 128, "block_size": 128,
"additional_config": { "additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}},
"xlite_graph_config": {
"enabled": True,
"full_mode": True
}
},
} }
gen_and_valid(runner_kwargs=runner_kwargs, gen_and_valid(
prompts=cur_case.prompts, runner_kwargs=runner_kwargs,
sampling_params=cur_case.sampling_params, prompts=cur_case.prompts,
golden_answers=cur_case.golden_answers) sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)

View File

@@ -1,5 +1,4 @@
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional
from vllm import SamplingParams from vllm import SamplingParams
@@ -7,37 +6,44 @@ from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal from tests.e2e.model_utils import check_outputs_equal
PROMPTS_SHORT = [ PROMPTS_SHORT = [
"Hello, my name is", "The president of the United States is", "Hello, my name is",
"The capital of France is", "The future of AI is" "The president of the United States is",
"The capital of France is",
"The future of AI is",
] ]
# NOTE: Randomly fill the prompt with the requested amount for # NOTE: Randomly fill the prompt with the requested amount for
# the specified capture shape to prevent accuracy issues caused by padding # the specified capture shape to prevent accuracy issues caused by padding
PROMPTS_LONG = [ PROMPTS_LONG = [
('Solve the following math problem step by step.' (
'The last line of your response should be of the form Answer: ' "Solve the following math problem step by step."
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n' "The last line of your response should be of the form Answer: "
'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$' "$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,' "In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$"
'$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.' "be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,"
'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,' "$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$."
'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.' "If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,"
), "where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$."
('Solve the following math problem step by step.' ),
'The last line of your response should be of the form Answer: ' (
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n' "Solve the following math problem step by step."
'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen' "The last line of your response should be of the form Answer: "
'independently and uniformly at random on the perimeter of $ABCD$.' "$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
'If the expected value of the area of triangle $\\triangle AXY$' "Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen"
'can be expressed as $\\frac{m}{n}$, for relatively prime positive' "independently and uniformly at random on the perimeter of $ABCD$."
'integers $m$ and $n$, compute $m+n$.'), "If the expected value of the area of triangle $\\triangle AXY$"
('Solve the following math problem step by step.' "can be expressed as $\\frac{m}{n}$, for relatively prime positive"
'The last line of your response should be of the form Answer: ' "integers $m$ and $n$, compute $m+n$."
'$Answer (without quotes) where $Answer is the answer to the problem.\n\n' ),
'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$' (
'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$' "Solve the following math problem step by step."
'and $x^2 + cx + b = 0$ also have a common real root.' "The last line of your response should be of the form Answer: "
'Compute the sum $a + b + c$.') "$Answer (without quotes) where $Answer is the answer to the problem.\n\n"
"Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$"
"and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$"
"and $x^2 + cx + b = 0$ also have a common real root."
"Compute the sum $a + b + c$."
),
] ]
@@ -46,7 +52,7 @@ class LLMTestCase:
model: str model: str
prompts: list[str] prompts: list[str]
golden_answers: list[str] golden_answers: list[str]
quantization: Optional[str] = None quantization: str | None = None
sampling_params: SamplingParams = field( sampling_params: SamplingParams = field(
default_factory=lambda: SamplingParams( default_factory=lambda: SamplingParams(
max_tokens=32, max_tokens=32,
@@ -54,14 +60,13 @@ class LLMTestCase:
top_p=1.0, top_p=1.0,
top_k=0, top_k=0,
n=1, n=1,
)) )
)
def gen_and_valid(runner_kwargs: dict, prompts: list[str], def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]):
sampling_params: SamplingParams, golden_answers: list[str]):
with VllmRunner(**runner_kwargs) as runner: with VllmRunner(**runner_kwargs) as runner:
vllm_aclgraph_outputs = runner.model.generate( vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params)
prompts=prompts, sampling_params=sampling_params)
outputs_gen = [] outputs_gen = []
for output in vllm_aclgraph_outputs: for output in vllm_aclgraph_outputs:
outputs_gen.append(([output.outputs[0].index], output.outputs[0].text)) outputs_gen.append(([output.outputs[0].index], output.outputs[0].text))