diff --git a/pyproject.toml b/pyproject.toml index 7e90ef2d..665950c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,11 +46,41 @@ plugins.md024.allow_different_nesting = true # no-duplicate-headers plugins.md029.enabled = false # ol-prefix [tool.ruff] -# TODO: according to PEP8, there should be 80 characters per line +# TODO: according to PEP 8, there should be at most 79 characters per line (we currently allow 120) line-length = 120 # Folder to be modified exclude = [ - "tests/**", + # Batch (1) + "tests/e2e/__init__.py", + "tests/e2e/310p/", + "tests/e2e/conftest.py", + "tests/e2e/doctests/", + "tests/e2e/model_utils.py", + "tests/e2e/models/", + "tests/e2e/multicard/2-cards/", + + # Batch (2) + "tests/e2e/multicard/4-cards/", + "tests/e2e/nightly/multi_node/", + + # Batch (3) + "tests/e2e/nightly/single_node/models/", + + # Batch (4) + "tests/e2e/nightly/single_node/ops/", + + # Batch (5) + # "tests/e2e/singlecard/", + + # Batch (6) + "tests/e2e/nightly/single_node/ops/singlecard_ops/triton/", + "tests/e2e/singlecard/pooling/", + "tests/e2e/singlecard/spec_decode/", + "tests/e2e/utils.py", + "tests/e2e/vllm_interface/", + "tests/e2e/weekly/", + + "tests/ut/", ] [tool.ruff.lint] diff --git a/tests/e2e/singlecard/compile/backend.py b/tests/e2e/singlecard/compile/backend.py index 3776a252..e0fde30c 100644 --- a/tests/e2e/singlecard/compile/backend.py +++ b/tests/e2e/singlecard/compile/backend.py @@ -14,8 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from collections.abc import Callable, Sequence from copy import deepcopy -from typing import Any, Callable, List, Optional, Sequence +from typing import Any import torch.fx as fx from torch._inductor.decomposition import select_decomp_table @@ -37,7 +38,7 @@ class TestBackend: records the FX graph before and after the transformation. """ - def __init__(self, custom_passes: Optional[List[Any]] = None): + def __init__(self, custom_passes: list[Any] | None = None): vllm_config = get_current_vllm_config() compile_config = vllm_config.compilation_config self.inductor_config = compile_config.inductor_compile_config @@ -48,9 +49,7 @@ class TestBackend: self.graph_pre_pass = None self.graph_post_pass = None - def post_pass(self, - graph: fx.Graph, - runtime_shape: int | None = None) -> fx.Graph: + def post_pass(self, graph: fx.Graph, runtime_shape: int | None = None) -> fx.Graph: """ Apply custom graph transformation passes. """ @@ -62,13 +61,13 @@ class TestBackend: return graph def compile( - self, - graph: fx.GraphModule, - example_inputs: list[Any], - compiler_config: dict[str, Any], - runtime_shape: Optional[int] = None, - key: Optional[str] = None - ) -> tuple[Optional[Callable], Optional[Any]]: + self, + graph: fx.GraphModule, + example_inputs: list[Any], + compiler_config: dict[str, Any], + runtime_shape: int | None = None, + key: str | None = None, + ) -> tuple[Callable | None, Any | None]: """ Compile the FX graph using vLLM's Ascend compiler interface. Wraps the post-pass logic into the inner_compile callback. @@ -87,8 +86,7 @@ class TestBackend: ) return compiled_fn, None - def __call__(self, gm: fx.GraphModule, - example_inputs: Optional[List[Any]]): + def __call__(self, gm: fx.GraphModule, example_inputs: list[Any] | None): """ Make the backend callable by torch.compile(). Returns a compiled executable function. 
@@ -103,17 +101,11 @@ class TestBackend: ) return compiled_fn - def find_nodes_by_target(self, graph: fx.GraphModule, - target: OpOverload) -> List[fx.Node]: + def find_nodes_by_target(self, graph: fx.GraphModule, target: OpOverload) -> list[fx.Node]: """Helper to find all FX nodes that call a specific operator.""" - return [ - node for node in graph.graph.nodes - if hasattr(node, 'target') and node.target == target - ] + return [node for node in graph.graph.nodes if hasattr(node, "target") and node.target == target] - def check_before_ops(self, - ops: Sequence[OpOverload], - fully_replaced: bool = True): + def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced: bool = True): """ Verify that the original (unfused) operators exist before the pass and are fully removed afterward (if fully_replaced=True). diff --git a/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py index 2b231a4d..1fa40e36 100644 --- a/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py @@ -215,6 +215,7 @@ def register_pattern_safe(pattern_class, vllm_config, eps, pattern_key): try: # Import the required pass class from torch._inductor.pattern_matcher import PatternMatcherPass + pm_pass = PatternMatcherPass() pattern.register(pm_pass) _registered_patterns.add(pattern_key) @@ -243,7 +244,7 @@ def test_rmsnorm_quant_fusion( sp_enable: bool, ): # Check if fusion operator is available - if not hasattr(torch.ops.npu, 'npu_add_rms_norm_quant'): + if not hasattr(torch.ops.npu, "npu_add_rms_norm_quant"): pytest.skip("Fusion operator npu_add_rms_norm_quant not available, skipping test") vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) @@ -266,7 +267,7 @@ def test_rmsnorm_quant_fusion( if not enable_custom_op(): pytest.skip("Custom ops not available, skipping bias test") # Check if the bias operator exists - if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'): + if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"): pytest.skip("Operator npu_add_rms_norm_bias not available, skipping bias test") if sp_enable: model = ModelSPWithBias(hidden_size, dtype, eps, device="npu") @@ -281,13 +282,11 @@ def test_rmsnorm_quant_fusion( else: # The non-bias patterns currently use npu_add_rms_norm_bias in their pattern matching # so we need to skip if it's not available - if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'): + if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"): pytest.skip("Operator npu_add_rms_norm_bias not available, skipping test") if sp_enable: model = ModelSPWithoutBias(hidden_size, dtype, eps, device="npu") - register_pattern_safe( - AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern" - ) + register_pattern_safe(AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern") else: model = ModelWithoutBias(hidden_size, dtype, eps, device="npu") register_pattern_safe(AddRMSNormQuantPattern, vllm_config, eps, "GraphEXAddRMSNormQuantPattern") @@ -302,5 +301,9 @@ def test_rmsnorm_quant_fusion( compiled_out, compiled_res = compiled_model(x) # Verify output shapes are correct - assert compiled_out.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}" - assert compiled_res.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}" + assert compiled_out.shape == (num_tokens, 
hidden_size), ( + f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}" + ) + assert compiled_res.shape == (num_tokens, hidden_size), ( + f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}" + ) diff --git a/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py b/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py index 7bd36880..7298ecb9 100644 --- a/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py +++ b/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py @@ -201,6 +201,7 @@ def test_rmsnorm_quant_fusion( vllm_config=vllm_config, head_dim=head_dim, num_heads=num_heads, num_kv_heads=num_kv_heads, eps=eps ) from torch._inductor.pattern_matcher import PatternMatcherPass + pm_pass = PatternMatcherPass() fusion_pattern.register(pm_pass) model = model.to("npu") diff --git a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py index b272c64f..00b2b123 100644 --- a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py @@ -14,25 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import List import pytest import torch import torch.nn as nn -import torch_npu import vllm.config from vllm.config import ModelConfig, VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) +from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment from vllm.utils.system_utils import update_environment_variables import vllm_ascend.ops.register_custom_ops # noqa from tests.e2e.singlecard.compile.backend import TestBackend from vllm_ascend.ascend_forward_context import set_ascend_forward_context -from vllm_ascend.compilation.passes.norm_quant_fusion_pass import \ - AddRMSNormQuantFusionPass -from vllm_ascend.utils import enable_custom_op -from vllm_ascend.utils import vllm_version_is +from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass +from vllm_ascend.utils import enable_custom_op, vllm_version_is if vllm_version_is("0.15.0"): from vllm.compilation.fx_utils import OpOverload # type: ignore @@ -48,34 +43,24 @@ def get_or_create_backend(vllm_config): """Get or create backend with fusion passes (cached to avoid duplicate pattern registration).""" global _backend_cache if _backend_cache is None: - _backend_cache = TestBackend(custom_passes=[ - AddRMSNormQuantFusionPass(vllm_config=vllm_config) - ]) + _backend_cache = TestBackend(custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)]) return _backend_cache + class TestModelWithoutBias(nn.Module): """ A minimal test model that simulates the pattern: AddRMSNorm → Quantization (without bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + 
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device) def forward(self, x): """ @@ -87,23 +72,20 @@ class TestModelWithoutBias(nn.Module): residual = torch.zeros_like(x) norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, None, self.eps) + x, residual, self.rms_norm_weight, None, self.eps + ) - quantized_output = torch.ops.vllm.quantize(norm_output, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" - return [ - torch.ops._C_ascend.npu_add_rms_norm_bias.default, - torch.ops.vllm.quantize.default - ] + return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" return [torch.ops.npu.npu_add_rms_norm_quant.default] @@ -114,24 +96,15 @@ class TestModelWithBias(nn.Module): AddRMSNorm → Add Bias → Quantization (with bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.bias = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device) def forward(self, x): """ @@ -144,23 +117,20 @@ class TestModelWithBias(nn.Module): residual = torch.zeros_like(x) norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, self.bias, self.eps) + x, residual, self.rms_norm_weight, self.bias, self.eps + ) - quantized_output = torch.ops.vllm.quantize(norm_output_with_bias, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" - return [ - torch.ops._C_ascend.npu_add_rms_norm_bias.default, - torch.ops.vllm.quantize.default - ] + return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" return 
[torch.ops.npu.npu_add_rms_norm_quant.default] @@ -171,23 +141,14 @@ class TestModelSPWithoutBias(nn.Module): AddRMSNorm → maybe_allgather → Quantization (without bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device) def forward(self, x): """ @@ -200,32 +161,28 @@ class TestModelSPWithoutBias(nn.Module): residual = torch.zeros_like(x) norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, None, self.eps) + x, residual, self.rms_norm_weight, None, self.eps + ) - norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( - norm_output, True) + norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output, True) - quantized_output = torch.ops.vllm.quantize(norm_output, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" return [ torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default, - torch.ops.vllm.quantize.default + torch.ops.vllm.quantize.default, ] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" - return [ - torch.ops.npu.npu_add_rms_norm_quant.default, - torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default - ] + return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default] class TestModelSPWithBias(nn.Module): @@ -234,24 +191,15 @@ class TestModelSPWithBias(nn.Module): AddRMSNorm → Add bias → maybe_allgather → Quantization (without bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.bias = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, 
dtype=dtype, device=device) def forward(self, x): """ @@ -265,32 +213,28 @@ class TestModelSPWithBias(nn.Module): residual = torch.zeros_like(x) norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, self.bias, self.eps) + x, residual, self.rms_norm_weight, self.bias, self.eps + ) - norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( - norm_output_with_bias, True) + norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output_with_bias, True) - quantized_output = torch.ops.vllm.quantize(norm_output_with_bias, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" return [ torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default, - torch.ops.vllm.quantize.default + torch.ops.vllm.quantize.default, ] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" - return [ - torch.ops.npu.npu_add_rms_norm_quant.default, - torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default - ] + return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default] @pytest.mark.parametrize("dtype", [torch.bfloat16]) @@ -317,58 +261,42 @@ def test_rmsnorm_quant_fusion( vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) with vllm.config.set_current_vllm_config(vllm_config): - update_environment_variables({ - "RANK": "0", - "LOCAL_RANK": "0", - "WORLD_SIZE": "1", - "MASTER_ADDR": "localhost", - "MASTER_PORT": "12345", - }) + update_environment_variables( + { + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + } + ) init_distributed_environment() ensure_model_parallel_initialized(1, 1) - with vllm.config.set_current_vllm_config(vllm_config): - with set_ascend_forward_context(None, vllm_config): - backend = get_or_create_backend(vllm_config) - if use_bias: - if not enable_custom_op(): - return - if sp_enable: - model = TestModelSPWithBias(hidden_size, - dtype, - eps, - device="npu") - else: - model = TestModelWithBias(hidden_size, - dtype, - eps, - device="npu") + with vllm.config.set_current_vllm_config(vllm_config), set_ascend_forward_context(None, vllm_config): + backend = get_or_create_backend(vllm_config) + if use_bias: + if not enable_custom_op(): + return + if sp_enable: + model = TestModelSPWithBias(hidden_size, dtype, eps, device="npu") else: - if sp_enable: - model = TestModelSPWithoutBias(hidden_size, - dtype, - eps, - device="npu") - else: - model = TestModelWithoutBias(hidden_size, - dtype, - eps, - device="npu") - model = model.to("npu") + model = TestModelWithBias(hidden_size, dtype, eps, device="npu") + else: + if sp_enable: + model = TestModelSPWithoutBias(hidden_size, dtype, eps, device="npu") + else: + model = TestModelWithoutBias(hidden_size, dtype, eps, device="npu") + model = model.to("npu") - x = torch.rand(num_tokens, - hidden_size, - device="npu", - dtype=dtype, - requires_grad=False) + x = torch.rand(num_tokens, hidden_size, device="npu", dtype=dtype, 
requires_grad=False) - result_unfused = model(x) - print("Unfused result:", [t.shape for t in result_unfused]) - model_fused = torch.compile(model, backend=backend) - result_fused = model_fused(x) - print("Fused result:", [t.shape for t in result_fused]) + result_unfused = model(x) + print("Unfused result:", [t.shape for t in result_unfused]) + model_fused = torch.compile(model, backend=backend) + result_fused = model_fused(x) + print("Fused result:", [t.shape for t in result_fused]) - print("=== Checking operator fusion ===") - backend.check_before_ops(model.ops_in_model_before(), - fully_replaced=not sp_enable) - backend.check_after_ops(model.ops_in_model_after()) + print("=== Checking operator fusion ===") + backend.check_before_ops(model.ops_in_model_before(), fully_replaced=not sp_enable) + backend.check_after_ops(model.ops_in_model_after()) diff --git a/tests/e2e/singlecard/model_runner_v2/test_basic.py b/tests/e2e/singlecard/model_runner_v2/test_basic.py index 672cd274..dc019a8b 100644 --- a/tests/e2e/singlecard/model_runner_v2/test_basic.py +++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py @@ -47,9 +47,9 @@ def test_qwen3_dense_eager_mode( sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=enforce_eager, + model, + max_model_len=1024, + enforce_eager=enforce_eager, ) as runner: runner.model.generate(prompts, sampling_params) @@ -74,14 +74,14 @@ def test_egale_spec_decoding( sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=enforce_eager, - async_scheduling=True, - speculative_config={ - "model": eagle_model, - "method": "eagle", - "num_speculative_tokens": 3, - }, + model, + max_model_len=1024, + enforce_eager=enforce_eager, + async_scheduling=True, + speculative_config={ + "model": eagle_model, + "method": "eagle", + "num_speculative_tokens": 3, + }, ) as runner: runner.model.generate(prompts, sampling_params) diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py index ac5c0de8..e031e93f 100644 --- a/tests/e2e/singlecard/test_aclgraph_accuracy.py +++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py @@ -15,20 +15,22 @@ # limitations under the License. # -import pytest +# ruff: noqa: E501 + import os -from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT, - LLMTestCase, gen_and_valid) +import pytest + +from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid CASE_QWEN_ACLGRAPH = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_SHORT, golden_answers=[ " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', - ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of', - ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and' + " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", + " Paris. 
The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of", + " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and", ], ) @@ -37,10 +39,10 @@ CASE_DS_ACLGRAPH = LLMTestCase( quantization="ascend", prompts=PROMPTS_SHORT, golden_answers=[ - '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', - ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', - ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', - ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of' + "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2", + " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the", + " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art", + " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of", ], ) @@ -49,9 +51,9 @@ CASE_QWEN_FULL = LLMTestCase( prompts=PROMPTS_SHORT, golden_answers=[ " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', - ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of', - ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and' + " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", + " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of", + " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and", ], ) @@ -60,10 +62,10 @@ CASE_DS_FULL = LLMTestCase( quantization="ascend", prompts=PROMPTS_SHORT, golden_answers=[ - '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', - ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', - ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', - ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of' + "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. 
I have been on medication for 2", + " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the", + " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art", + " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of", ], ) @@ -71,10 +73,11 @@ CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_LONG, golden_answers=[ - ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', + " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the", " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over", - ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' - ]) + " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can", + ], +) CASE_DS_FULL_DECODE_ONLY = LLMTestCase( model="vllm-ascend/DeepSeek-V2-Lite-W8A8", @@ -83,26 +86,31 @@ CASE_DS_FULL_DECODE_ONLY = LLMTestCase( golden_answers=[ "\n\nSelect an assignment template", "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use", - "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations" - ]) + "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations", + ], +) CASE_QWEN_EX = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_LONG, golden_answers=[ - ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', + " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the", " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over", - ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' - ]) + " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can", + ], +) + +CASE_DS_EX = LLMTestCase( + model="vllm-ascend/DeepSeek-V2-Lite-W8A8", + quantization="ascend", + prompts=PROMPTS_LONG, + golden_answers=[ + "\n\nSelect an assignment template", + "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use", + "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations", + ], +) -CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8", - quantization="ascend", - prompts=PROMPTS_LONG, - golden_answers=[ - "\n\nSelect an assignment template", - "\n\nI'm not sure how to approach this problem. 
I'm not sure if I should use the law of total probability or if I should use", - "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations" - ]) @pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH]) def test_piecewise_res_consistency(cur_case: LLMTestCase): @@ -112,51 +120,48 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase): "cudagraph_capture_sizes": [1, 2, 4, 8], "quantization": cur_case.quantization, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) -@pytest.mark.parametrize( - "cur_case", [CASE_QWEN_FULL, CASE_DS_FULL]) + +@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL]) def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) runner_kwargs = { "model_name": cur_case.model, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8, 32, 64], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, "quantization": cur_case.quantization, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) -@pytest.mark.parametrize( - "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY]) + +@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY]) def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) runner_kwargs = { "model_name": cur_case.model, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8, 32, 64], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, "quantization": cur_case.quantization, - "additional_config": { - "npugraph_ex_config": { - "enable": False - } - }, + "additional_config": {"npugraph_ex_config": {"enable": False}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) + @pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX]) def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): @@ -165,20 +170,16 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): "model_name": cur_case.model, "quantization": cur_case.quantization, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8, 32, 64], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, - "additional_config": { - "npugraph_ex_config": { - "enable": True - } - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, + "additional_config": {"npugraph_ex_config": {"enable": 
True}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) + # The accuracy has already been verified in the previous test case. # This test case is used to check whether the functionality works properly @@ -190,10 +191,7 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch): "model_name": cur_case.model, "quantization": cur_case.quantization, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8], "cudagraph_mode": "FULL_DECODE_ONLY"}, "additional_config": { "npugraph_ex_config": { "enable": True, @@ -201,12 +199,14 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch): } }, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) # Check whether the static kernel is properly uninstall ascend_home_path = os.environ["ASCEND_HOME_PATH"] - static_kernel_install_path = os.path.join(ascend_home_path, 'opp/static_kernel/ai_core') + static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core") assert not os.path.exists(static_kernel_install_path) diff --git a/tests/e2e/singlecard/test_aclgraph_batch_invariant.py b/tests/e2e/singlecard/test_aclgraph_batch_invariant.py index 048400c8..47413ba4 100644 --- a/tests/e2e/singlecard/test_aclgraph_batch_invariant.py +++ b/tests/e2e/singlecard/test_aclgraph_batch_invariant.py @@ -22,6 +22,7 @@ import random import pytest import torch from vllm import SamplingParams + from tests.e2e.conftest import VllmRunner DEFAULT_MODEL = "Qwen/Qwen3-0.6B" @@ -69,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: if target_words > 50: # For longer prompts, repeat context - padding_text = ( - " This is an interesting topic that deserves more explanation. " * - (target_words // 50)) + padding_text = " This is an interesting topic that deserves more explanation. 
" * (target_words // 50) base_prompt = base_prompt + padding_text return base_prompt @@ -107,8 +106,7 @@ def _extract_step_logprobs(generate_output): @pytest.mark.timeout(1000) -def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( - monkeypatch: pytest.MonkeyPatch): +def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch): """ Ensures that the same request (the 'needle' prompt) yields identical output whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), @@ -162,20 +160,16 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( needle_prompt = "There once was a " with VllmRunner( - model_name=model, - max_num_seqs=max_batch_size, - gpu_memory_utilization=gpu_mem_util, - max_model_len=max_model_len, - dtype="bfloat16", - tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), - enable_prefix_caching=False, - distributed_executor_backend="mp", - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - } + model_name=model, + max_num_seqs=max_batch_size, + gpu_memory_utilization=gpu_mem_util, + max_model_len=max_model_len, + dtype="bfloat16", + tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), + enable_prefix_caching=False, + distributed_executor_backend="mp", + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, ) as vllm_model: - # Baseline generation for the needle prompt alone. baseline_out = vllm_model.generate([needle_prompt], sampling) assert len(baseline_out) == 1 @@ -194,8 +188,7 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( if i == needle_pos: prompts.append(needle_prompt) else: - prompts.append( - _random_prompt(min_random_prompt, max_random_prompt)) + prompts.append(_random_prompt(min_random_prompt, max_random_prompt)) # Generate with the larger-batch engine outputs = vllm_model.generate(prompts, sampling) @@ -204,24 +197,23 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( text = needle_output[0] if text != baseline_text: - print( - f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") + print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") mismatches += 1 passes = num_trials - mismatches # Dump how many passed vs failed - print(f"[determinism] total={num_trials}, passed={passes}, " - f"failed={mismatches}, max_batch_size={max_batch_size}") + print( + f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}" + ) if mismatches > 0: pytest.fail( f"Nondeterministic outputs detected: {mismatches} failed out " - f"of {num_trials} trials (max_batch_size={max_batch_size}).") + f"of {num_trials} trials (max_batch_size={max_batch_size})." 
+ ) - -def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - monkeypatch: pytest.MonkeyPatch): +def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch): seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = DEFAULT_MODEL @@ -235,24 +227,19 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( if disable_custom_ar: print(f"\n{'=' * 80}") - print( - f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})" - ) + print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})") print(f"{'=' * 80}\n") with VllmRunner( - model_name=model_name, - tensor_parallel_size=tp_size, - enable_prefix_caching=False, - max_num_seqs=32, - max_model_len=8192, - dtype="bfloat16", - gpu_memory_utilization=0.9, - distributed_executor_backend="mp", - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - } + model_name=model_name, + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + max_num_seqs=32, + max_model_len=8192, + dtype="bfloat16", + gpu_memory_utilization=0.9, + distributed_executor_backend="mp", + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, ) as vllm_model: # Use more realistic prompts for better token generation prompts = [_random_prompt(10, 50) for i in range(32)] @@ -273,16 +260,13 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." - ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False) assert len(outs) == 1 # print(outs) step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -304,108 +288,91 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. 
failed_prompts = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - failed_prompts.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + failed_prompts.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - failed_prompts.append({ - "prompt_idx": - i, - "step": - "sampling", - "reason": - "Different tokens sampled", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": - [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], - "bsN_all_logprobs": - [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - failed_prompts.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - b).max().item() # Print which token failed - print( - f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}" - ) + print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - failed_prompts.append({ - "prompt_idx": - i, - "step": - t, - "reason": - f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": [ - logprobs_bs1[s].tolist() - for s in range(len(logprobs_bs1)) - ], - "bsN_all_logprobs": [ - logprobs_bsN[s].tolist() - for s in range(len(logprobs_bsN)) - ], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": 
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) break - # Print summary of all failures if failed_prompts: print(f"\n{'=' * 80}") - fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/" - f"{len(prompts)} prompts failed") + fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed" print(fail_msg) print(f"{'=' * 80}") for fail in failed_prompts: @@ -420,21 +387,18 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f" BS=N tokens: {fail['bsN_tokens']}") if "bs1_all_logprobs" in fail: - print( - f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:" - ) + print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") - print( - f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:" - ) + print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") print(f"{'=' * 80}\n") # Fail the test with summary - msg = (f"Batch invariance violated in {len(failed_prompts)}/" - f"{len(prompts)} prompts. See output above for details.") + msg = ( + f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details." + ) pytest.fail(msg) @@ -446,18 +410,15 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch): model = DEFAULT_MODEL with VllmRunner( - model_name=model, - max_num_seqs=1, - tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), - gpu_memory_utilization=0.9, - max_model_len=2048, - dtype="float16", - enable_prefix_caching=False, - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - }, - distributed_executor_backend="mp", + model_name=model, + max_num_seqs=1, + tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), + gpu_memory_utilization=0.9, + max_model_len=2048, + dtype="float16", + enable_prefix_caching=False, + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, + distributed_executor_backend="mp", ) as vllm_model: prompt = "The capital of France is" sampling_params = SamplingParams( @@ -479,11 +440,7 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch): print(f"{'=' * 80}\n") - - - -def test_aclgraph_logprobs_without_batch_invariance_should_fail( - monkeypatch: pytest.MonkeyPatch): +def test_aclgraph_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch): """ This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. 
It DISABLES batch invariance mode and expects to see non-deterministic behavior @@ -505,19 +462,15 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( print(f"{'=' * 80}\n") with VllmRunner( - model_name=model_name, - tensor_parallel_size=tp_size, - enable_prefix_caching=False, - max_num_seqs=32, - max_model_len=8192, - dtype="bfloat16", - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - }, - distributed_executor_backend="mp", + model_name=model_name, + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + max_num_seqs=32, + max_model_len=8192, + dtype="bfloat16", + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, + distributed_executor_backend="mp", ) as vllm_model: - # build ragged prompts to change shapes significantly across BS=1 vs BS=N long_min = int(os.getenv("VLLM_MIN_PROMPT", "768")) long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048")) @@ -549,16 +502,13 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." - ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False) assert len(outs) == 1 step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -579,84 +529,90 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. 
differences_found = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - differences_found.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + differences_found.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - differences_found.append({ - "prompt_idx": i, - "step": "sampling", - "reason": "Different tokens sampled", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - b).max().item() - print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " - f"Token {t}: max_diff={max_diff:.6e}") + print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break - # Print summary print(f"\n{'=' * 80}") if differences_found: success_msg = ( f"✓ SUCCESS: Batch invariance is doing something! " f"Found {len(differences_found)}/{len(prompts)} prompts " - f"with differences when batch invariance was DISABLED.") + f"with differences when batch invariance was DISABLED." + ) print(success_msg) print(f"{'=' * 80}") for diff in differences_found: @@ -676,7 +632,8 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( f"✗ UNEXPECTED: All {len(prompts)} prompts matched " f"between BS=1 and BS=N even with batch invariance DISABLED. 
" f"This suggests batch invariance might not be necessary, " - f"or the test needs more sensitive prompts.") + f"or the test needs more sensitive prompts." + ) print(fail_msg) print(f"{'=' * 80}\n") pytest.fail(fail_msg) diff --git a/tests/e2e/singlecard/test_aclgraph_mem.py b/tests/e2e/singlecard/test_aclgraph_mem.py index 25d09786..ff73b168 100644 --- a/tests/e2e/singlecard/test_aclgraph_mem.py +++ b/tests/e2e/singlecard/test_aclgraph_mem.py @@ -40,7 +40,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None: capture_mem_after = multiprocessing.Value("q", -1) # long long def capture_model_wrapper(original_method): - def wrapped(self): mem_before = torch.npu.mem_get_info()[0] # free memory result = original_method(self) @@ -55,19 +54,16 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None: original_capture = NPUModelRunner.capture_model - with patch.object(NPUModelRunner, - 'capture_model', - new=capture_model_wrapper(original_capture)): + with patch.object(NPUModelRunner, "capture_model", new=capture_model_wrapper(original_capture)): prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] - sampling_params = SamplingParams(max_tokens=max_tokens, - temperature=0.0) + sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8": - vllm_model = VllmRunner(model, - max_model_len=1024, - quantization="ascend") + vllm_model = VllmRunner(model, max_model_len=1024, quantization="ascend") else: vllm_model = VllmRunner(model) _ = vllm_model.generate(prompts, sampling_params) @@ -94,5 +90,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None: assert mem_used_by_capture < max_mem_expected, ( f"capture_model used more memory than expected. 
" f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, " - f"Expected: < {max_capture_mem_gib:.2f} GiB") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn' + f"Expected: < {max_capture_mem_gib:.2f} GiB" + ) + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/tests/e2e/singlecard/test_async_scheduling.py b/tests/e2e/singlecard/test_async_scheduling.py index b8d53b84..e815d90c 100644 --- a/tests/e2e/singlecard/test_async_scheduling.py +++ b/tests/e2e/singlecard/test_async_scheduling.py @@ -15,8 +15,7 @@ from tests.e2e.model_utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16" -first_prompt = ("The following numbers of the sequence " + - ", ".join(str(i) for i in range(10)) + " are:") +first_prompt = "The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:" example_prompts = [ "Hello, my name is", "The president of the United States is", @@ -31,7 +30,9 @@ default_params = dict( ) -def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ): +def test_without_spec_decoding( + monkeypatch: pytest.MonkeyPatch, +): """Test consistency of combos of async scheduling, preemption, uni/multiproc executor, prefill chunking.""" test_sampling_params: list[dict[str, Any]] = [ @@ -85,11 +86,11 @@ def run_tests( # avoid precision errors outputs: list[tuple[str, list, list]] = [] for n, ( - test_preemption, - executor, - async_scheduling, - spec_config, - test_prefill_chunking, + test_preemption, + executor, + async_scheduling, + spec_config, + test_prefill_chunking, ) in enumerate(test_configs, 1): test_str = f"{n}/{len(test_configs)}" test_results = run_test( @@ -105,21 +106,18 @@ def run_tests( outputs.append(test_results) baseline_config, baseline_tests, _ = outputs[0] - _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), - (None, None, None)) + _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), (None, None, None)) - print( - f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}" - ) + print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}") failure = None for test_config, test_outputs, test_acceptance_rates in outputs[1:]: for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip( - baseline_tests, - baseline_acceptances or repeat(None), - test_outputs, - test_acceptance_rates or repeat(None), - test_sampling_params, + baseline_tests, + baseline_acceptances or repeat(None), + test_outputs, + test_acceptance_rates or repeat(None), + test_sampling_params, ): try: check_outputs_equal( @@ -129,21 +127,18 @@ def run_tests( name_1=f"config=[{test_config}], params={params}", ) - if (base_acceptance_rate is not None - and test_acceptance_rate is not None): + if base_acceptance_rate is not None and test_acceptance_rate is not None: if "spec_mml=None" in test_config: - assert (test_acceptance_rate > base_acceptance_rate - or test_acceptance_rate == pytest.approx( - base_acceptance_rate, rel=5e-2)) + assert test_acceptance_rate > base_acceptance_rate or test_acceptance_rate == pytest.approx( + base_acceptance_rate, rel=5e-2 + ) else: # Currently the reported acceptance rate is expected to be # lower when we sometimes skip drafting altogether. 
assert test_acceptance_rate > 0.1 - print(f"PASSED: config=[{test_config}], params={params}" - f" accept_rate={test_acceptance_rate}") + print(f"PASSED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}") except AssertionError as e: - print(f"FAILED: config=[{test_config}], params={params}" - f" accept_rate={test_acceptance_rate}") + print(f"FAILED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}") if failure is None: failure = e @@ -161,33 +156,35 @@ def run_test( spec_config: dict[str, Any] | None, test_prefill_chunking: bool, ): - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( # Force preemptions - dict(num_gpu_blocks_override=2) if test_preemption else dict( - gpu_memory_utilization=0.9)) + dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9) + ) spec_mml = (spec_config or {}).get("max_model_len") - test_config = (f"executor={executor}, preemption={test_preemption}, " - f"async_sched={async_scheduling}, " - f"chunk_prefill={test_prefill_chunking}, " - f"spec_decoding={spec_decoding}, spec_mml={spec_mml}") + test_config = ( + f"executor={executor}, preemption={test_preemption}, " + f"async_sched={async_scheduling}, " + f"chunk_prefill={test_prefill_chunking}, " + f"spec_decoding={spec_decoding}, spec_mml={spec_mml}" + ) print("-" * 80) print(f"---- TESTING {test_str}: {test_config}") print("-" * 80) with VllmRunner( - model, - max_model_len=512, - enable_chunked_prefill=test_prefill_chunking, - # Force prefill chunking - max_num_batched_tokens=48 if test_prefill_chunking else None, - enforce_eager=True, - async_scheduling=async_scheduling, - distributed_executor_backend=executor, - dtype="float16", # avoid precision errors - speculative_config=spec_config, - disable_log_stats=False, - **cache_arg, + model, + max_model_len=512, + enable_chunked_prefill=test_prefill_chunking, + # Force prefill chunking + max_num_batched_tokens=48 if test_prefill_chunking else None, + enforce_eager=True, + async_scheduling=async_scheduling, + distributed_executor_backend=executor, + dtype="float16", # avoid precision errors + speculative_config=spec_config, + disable_log_stats=False, + **cache_arg, ) as vllm_model: results = [] acceptance_rates: list[float] | None = [] if spec_decoding else None @@ -197,26 +194,23 @@ def run_test( results.append( vllm_model.generate( example_prompts, - sampling_params=SamplingParams(**default_params, - **override_params), - )) + sampling_params=SamplingParams(**default_params, **override_params), + ) + ) metrics_after = vllm_model.model.get_metrics() if acceptance_rates is not None: - acceptance_rate = _get_acceptance_rate(metrics_before, - metrics_after) + acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after) acceptance_rates.append(acceptance_rate) print(f"ACCEPTANCE RATE {acceptance_rate}") if test_preemption: - preemptions = _get_count(metrics_before, metrics_after, - "vllm:num_preemptions") + preemptions = _get_count(metrics_before, metrics_after, "vllm:num_preemptions") assert preemptions > 0, "preemption test had no preemptions" if len(results) > 1: # First check that the different parameter configs # actually result in different output. 
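The loop below intentionally inverts the usual comparison: for differing sampling parameters it expects check_outputs_equal to raise, which is the standard pytest.raises pattern. A minimal, self-contained sketch of that pattern (the compare helper here is illustrative, not the real check_outputs_equal):

    import pytest

    def compare(a: list[str], b: list[str]) -> None:
        assert a == b, "outputs differ"

    # The assertion is expected to fire for genuinely different outputs.
    with pytest.raises(AssertionError):
        compare(["Hello"], ["Bonjour"])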
- for other_test_outs, params in zip(results[1:], - sampling_param_tests[1:]): + for other_test_outs, params in zip(results[1:], sampling_param_tests[1:]): with pytest.raises(AssertionError): check_outputs_equal( outputs_0_lst=results[0][0], diff --git a/tests/e2e/singlecard/test_auto_fit_max_mode_len.py b/tests/e2e/singlecard/test_auto_fit_max_mode_len.py index a576f132..814147ab 100644 --- a/tests/e2e/singlecard/test_auto_fit_max_mode_len.py +++ b/tests/e2e/singlecard/test_auto_fit_max_mode_len.py @@ -42,6 +42,7 @@ def new_kv_cache_spec( attention_chunk_size=attention_chunk_size, ) + def test_auto_fit_max_model_len(): """Test that max_model_len=-1 auto-fits to available NPU memory.""" # Create config with original_max_model_len=-1 to trigger auto-fit @@ -59,9 +60,7 @@ def test_auto_fit_max_model_len(): # With enough memory, max_model_len stays at the derived max large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory - _kv_cache_configs = get_kv_cache_configs( - vllm_config, [kv_cache_specs], [large_available_memory] - ) + _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory]) assert vllm_config.model_config.max_model_len == 1024 # Reset for next test @@ -73,9 +72,7 @@ def test_auto_fit_max_model_len(): # Need memory for at least max_model_len tokens # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens limited_memory = mem_per_block_per_layer * 2 * 32 - _kv_cache_configs = get_kv_cache_configs( - vllm_config, [kv_cache_specs], [limited_memory] - ) + _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [limited_memory]) # Should be reduced to fit in memory assert vllm_config.model_config.max_model_len < 1024 assert vllm_config.model_config.max_model_len > 0 @@ -94,7 +91,5 @@ def test_auto_fit_max_model_len_not_triggered(): } # This should work normally without auto-fit - _kv_cache_configs = get_kv_cache_configs( - vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32] - ) + _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]) assert vllm_config.model_config.max_model_len == 16 diff --git a/tests/e2e/singlecard/test_batch_invariant.py b/tests/e2e/singlecard/test_batch_invariant.py index d4fd423c..50a86cdb 100644 --- a/tests/e2e/singlecard/test_batch_invariant.py +++ b/tests/e2e/singlecard/test_batch_invariant.py @@ -70,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: if target_words > 50: # For longer prompts, repeat context - padding_text = ( - " This is an interesting topic that deserves more explanation. " * - (target_words // 50)) + padding_text = " This is an interesting topic that deserves more explanation. 
" * (target_words // 50) base_prompt = base_prompt + padding_text return base_prompt @@ -83,10 +81,7 @@ def _extract_step_logprobs(request_output): inner = request_output.outputs[0] if hasattr(inner, "logprobs") and inner.logprobs is not None: t = torch.tensor( - [ - inner.logprobs[i][tid].logprob - for i, tid in enumerate(inner.token_ids) - ], + [inner.logprobs[i][tid].logprob for i, tid in enumerate(inner.token_ids)], dtype=torch.float32, ) return t, inner.token_ids @@ -95,8 +90,7 @@ def _extract_step_logprobs(request_output): @pytest.mark.timeout(1000) -def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( - monkeypatch: pytest.MonkeyPatch): +def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch): """ Ensures that the same request (the 'needle' prompt) yields identical output whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), @@ -184,8 +178,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( if i == needle_pos: prompts.append(needle_prompt) else: - prompts.append( - _random_prompt(min_random_prompt, max_random_prompt)) + prompts.append(_random_prompt(min_random_prompt, max_random_prompt)) # Generate with the larger-batch engine outputs = llm.generate(prompts, sampling) @@ -196,27 +189,27 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( text = needle_output.outputs[0].text if text != baseline_text: - print( - f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") + print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") mismatches += 1 passes = num_trials - mismatches # Dump how many passed vs failed - print(f"[determinism] total={num_trials}, passed={passes}, " - f"failed={mismatches}, max_batch_size={max_batch_size}") + print( + f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}" + ) if mismatches > 0: pytest.fail( f"Nondeterministic outputs detected: {mismatches} failed out " - f"of {num_trials} trials (max_batch_size={max_batch_size}).") + f"of {num_trials} trials (max_batch_size={max_batch_size})." + ) finally: del llm cleanup_dist_env_and_memory() -def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - monkeypatch: pytest.MonkeyPatch): +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch): seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = DEFAULT_MODEL @@ -230,9 +223,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( if disable_custom_ar: print(f"\n{'=' * 80}") - print( - f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})" - ) + print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})") print(f"{'=' * 80}\n") llm = LLM( @@ -266,15 +257,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." 
- ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = llm.generate([p], sp, use_tqdm=False) assert len(outs) == 1 step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -296,108 +284,92 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. failed_prompts = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - failed_prompts.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + failed_prompts.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - failed_prompts.append({ - "prompt_idx": - i, - "step": - "sampling", - "reason": - "Different tokens sampled", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": - [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], - "bsN_all_logprobs": - [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - failed_prompts.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - 
b).max().item() # Print which token failed - print( - f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}" - ) + print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - failed_prompts.append({ - "prompt_idx": - i, - "step": - t, - "reason": - f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": [ - logprobs_bs1[s].tolist() - for s in range(len(logprobs_bs1)) - ], - "bsN_all_logprobs": [ - logprobs_bsN[s].tolist() - for s in range(len(logprobs_bsN)) - ], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) break del llm cleanup_dist_env_and_memory() # Print summary of all failures if failed_prompts: print(f"\n{'=' * 80}") - fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/" - f"{len(prompts)} prompts failed") + fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed" print(fail_msg) print(f"{'=' * 80}") for fail in failed_prompts: @@ -412,21 +384,18 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f" BS=N tokens: {fail['bsN_tokens']}") if "bs1_all_logprobs" in fail: - print( - f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:" - ) + print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") - print( - f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:" - ) + print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") print(f"{'=' * 80}\n") # Fail the test with summary - msg = (f"Batch invariance violated in {len(failed_prompts)}/" - f"{len(prompts)} prompts. See output above for details.") + msg = ( + f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details." + ) pytest.fail(msg) @@ -476,8 +445,7 @@ def test_simple_generation(monkeypatch: pytest.MonkeyPatch): cleanup_dist_env_and_memory() -def test_logprobs_without_batch_invariance_should_fail( - monkeypatch: pytest.MonkeyPatch): +def test_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch): """ This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. It DISABLES batch invariance mode and expects to see non-deterministic behavior @@ -540,15 +508,12 @@ def test_logprobs_without_batch_invariance_should_fail( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." 
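Both batch-invariance tests compare per-step logprobs with torch.equal, i.e. exact elementwise equality (the "bitwise" check referred to in the failure messages above), rather than a tolerance-based check. A minimal, self-contained sketch of the difference, assuming nothing beyond stock torch:

    import torch

    a = torch.tensor([1.0, 2.0])
    b = a + 1e-6                  # tiny numeric drift

    assert not torch.equal(a, b)  # exact comparison: any difference fails
    assert torch.allclose(a, b)   # tolerance-based comparison: passes (default rtol=1e-5)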
- ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = llm.generate([p], sp, use_tqdm=False) assert len(outs) == 1 step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -569,74 +534,80 @@ def test_logprobs_without_batch_invariance_should_fail( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. differences_found = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - differences_found.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + differences_found.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - differences_found.append({ - "prompt_idx": i, - "step": "sampling", - "reason": "Different tokens sampled", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - b).max().item() - print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " - f"Token {t}: max_diff={max_diff:.6e}") + print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: 
bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break del llm cleanup_dist_env_and_memory() @@ -646,7 +617,8 @@ def test_logprobs_without_batch_invariance_should_fail( success_msg = ( f"✓ SUCCESS: Batch invariance is doing something! " f"Found {len(differences_found)}/{len(prompts)} prompts " - f"with differences when batch invariance was DISABLED.") + f"with differences when batch invariance was DISABLED." + ) print(success_msg) print(f"{'=' * 80}") for diff in differences_found: @@ -666,7 +638,8 @@ def test_logprobs_without_batch_invariance_should_fail( f"✗ UNEXPECTED: All {len(prompts)} prompts matched " f"between BS=1 and BS=N even with batch invariance DISABLED. " f"This suggests batch invariance might not be necessary, " - f"or the test needs more sensitive prompts.") + f"or the test needs more sensitive prompts." + ) print(fail_msg) print(f"{'=' * 80}\n") pytest.fail(fail_msg) diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py index 5bdf68b7..db9762ae 100644 --- a/tests/e2e/singlecard/test_camem.py +++ b/tests/e2e/singlecard/test_camem.py @@ -37,10 +37,7 @@ def test_end_to_end(): prompt = "How are you?" sampling_params = SamplingParams(temperature=0, max_tokens=10) - with VllmRunner("Qwen/Qwen3-0.6B", - enable_sleep_mode=True, - cudagraph_capture_sizes=[1, 2, 4, 8]) as runner: - + with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True, cudagraph_capture_sizes=[1, 2, 4, 8]) as runner: output = runner.model.generate(prompt, sampling_params) # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # which is difficult to measure in the test. 
therefore, we only diff --git a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py index 0e8ececa..2d6993f6 100644 --- a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py +++ b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py @@ -30,9 +30,7 @@ MODELS = ["Qwen/Qwen3-0.6B"] def get_prompt_embeds(chat, tokenizer, embedding_layer): """Convert chat messages to prompt embeddings.""" - token_ids = tokenizer.apply_chat_template(chat, - add_generation_prompt=True, - return_tensors='pt') + token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt") prompt_embeds = embedding_layer(token_ids).squeeze(0) return prompt_embeds @@ -53,15 +51,16 @@ def test_mixed_prompt_embeds_and_text(model_name): # Run inference with mixed inputs with VllmRunner( - model_name, - enable_prompt_embeds=True, - cudagraph_capture_sizes=[1, 2, 4, 8], + model_name, + enable_prompt_embeds=True, + cudagraph_capture_sizes=[1, 2, 4, 8], ) as vllm_runner: # Test prompt embeddings - embeds_output = vllm_runner.model.generate({ - "prompt_embeds": - prompt_embeds, - }) + embeds_output = vllm_runner.model.generate( + { + "prompt_embeds": prompt_embeds, + } + ) # Test text prompt text_output = vllm_runner.model.generate(text_prompt) diff --git a/tests/e2e/singlecard/test_cpu_offloading.py b/tests/e2e/singlecard/test_cpu_offloading.py index e51a70d9..61b15597 100644 --- a/tests/e2e/singlecard/test_cpu_offloading.py +++ b/tests/e2e/singlecard/test_cpu_offloading.py @@ -107,15 +107,13 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber): def _accuracy_test(llm: LLM, subscriber: MockSubscriber): sampling_params = SamplingParams(max_tokens=1) - cpu_block_size = (llm.llm_engine.vllm_config.kv_transfer_config. - kv_connector_extra_config["block_size"]) + cpu_block_size = llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config["block_size"] subscriber.get_new_cpu_stored_events() # prepend prompt to be cpu block aligned prompt = "Let's count to 10. One, two, three, four," - while (len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % - cpu_block_size != 0): + while len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size != 0: prompt = ". " + prompt assert subscriber.get_new_cpu_stored_events() @@ -123,8 +121,7 @@ def _accuracy_test(llm: LLM, subscriber: MockSubscriber): test_count = 100 success_count = 0 for i in range(test_count): - if (llm.generate(prompt, sampling_params, - use_tqdm=False)[0].outputs[0].text == " five"): + if llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text == " five": success_count += 1 assert success_count >= 0.5 * test_count @@ -143,7 +140,7 @@ def test_cpu_offloading() -> None: "num_cpu_blocks": 1000, "block_size": 128, "spec_name": "NPUOffloadingSpec", - "spec_module_path": "vllm_ascend.kv_offload.npu" + "spec_module_path": "vllm_ascend.kv_offload.npu", }, ) diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 8b1d83a8..fb1546ce 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -17,7 +17,7 @@ # limitations under the License. 
# import json -from typing import Any, Dict +from typing import Any import jsonschema import pytest @@ -34,8 +34,10 @@ GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"] @pytest.fixture(scope="module") def sample_regex(): - return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + return ( + r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + ) @pytest.fixture(scope="module") @@ -43,66 +45,41 @@ def sample_json_schema(): return { "type": "object", "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, - "skills": { - "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 - }, + "name": {"type": "string"}, + "age": {"type": "integer"}, + "skills": {"type": "array", "items": {"type": "string", "maxLength": 10}, "minItems": 3}, "work_history": { "type": "array", "items": { "type": "object", "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "number" - }, - "position": { - "type": "string" - } + "company": {"type": "string"}, + "duration": {"type": "number"}, + "position": {"type": "string"}, }, - "required": ["company", "position"] - } - } + "required": ["company", "position"], + }, + }, }, - "required": ["name", "age", "skills", "work_history"] + "required": ["name", "age", "skills", "work_history"], } @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) -def test_guided_json_completion(guided_decoding_backend: str, - sample_json_schema): - runner_kwargs: Dict[str, Any] = {} +def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema): + runner_kwargs: dict[str, Any] = {} sampling_params = SamplingParams( - temperature=1.0, - max_tokens=500, - structured_outputs=StructuredOutputsParams(json=sample_json_schema)) + temperature=1.0, max_tokens=500, structured_outputs=StructuredOutputsParams(json=sample_json_schema) + ) runner_kwargs = { "cudagraph_capture_sizes": [1, 2, 4, 8], "seed": 0, - "structured_outputs_config": { - "backend": guided_decoding_backend - }, + "structured_outputs_config": {"backend": guided_decoding_backend}, } with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: - prompts = [ - f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}" - ] * 2 + prompts = [f"Give an example JSON for an employee profile that fits this schema: {sample_json_schema}"] * 2 inputs = vllm_model.get_inputs(prompts) - outputs = vllm_model.model.generate(inputs, - sampling_params=sampling_params) + outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params) assert outputs is not None @@ -115,34 +92,27 @@ def test_guided_json_completion(guided_decoding_backend: str, assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_json_schema) + jsonschema.validate(instance=output_json, schema=sample_json_schema) @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) def test_guided_regex(guided_decoding_backend: str, sample_regex): if guided_decoding_backend == "outlines": pytest.skip("Outlines doesn't support regex-based guided decoding.") - runner_kwargs: Dict[str, Any] = {} + runner_kwargs: dict[str, Any] = {} sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - structured_outputs=StructuredOutputsParams(regex=sample_regex)) + temperature=0.8, top_p=0.95, 
structured_outputs=StructuredOutputsParams(regex=sample_regex) + ) runner_kwargs = { "cudagraph_capture_sizes": [1, 2, 4, 8], "seed": 0, - "structured_outputs_config": { - "backend": guided_decoding_backend - }, + "structured_outputs_config": {"backend": guided_decoding_backend}, } with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: - prompts = [ - f"Give an example IPv4 address with this regex: {sample_regex}" - ] * 2 + prompts = [f"Give an example IPv4 address with this regex: {sample_regex}"] * 2 inputs = vllm_model.get_inputs(prompts) - outputs = vllm_model.model.generate(inputs, - sampling_params=sampling_params) + outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params) assert outputs is not None for output in outputs: assert output is not None diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py index d59acd30..bf578cb0 100644 --- a/tests/e2e/singlecard/test_ilama_lora.py +++ b/tests/e2e/singlecard/test_ilama_lora.py @@ -19,20 +19,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 + query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 ), PROMPT_TEMPLATE.format( - query= - "What are all distinct countries where singers above age 20 are from?" # noqa: E501 + query="What are all distinct countries where singers above age 20 are from?" # noqa: E501 ), ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + prompts, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None + ) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -45,16 +41,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def test_ilama_lora(ilama_lora_files): with VllmRunner( - MODEL_PATH, - enable_lora=True, - dtype="half", - max_loras=4, - max_model_len=1024, - cudagraph_capture_sizes=[1, 2, 4, 8], - max_num_seqs=16, - enforce_eager=True, + MODEL_PATH, + enable_lora=True, + dtype="half", + max_loras=4, + max_model_len=1024, + cudagraph_capture_sizes=[1, 2, 4, 8], + max_num_seqs=16, + enforce_eager=True, ) as vllm_model: - output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/e2e/singlecard/test_llama32_lora.py b/tests/e2e/singlecard/test_llama32_lora.py index ead2827e..ab7015b2 100644 --- a/tests/e2e/singlecard/test_llama32_lora.py +++ b/tests/e2e/singlecard/test_llama32_lora.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest +from unittest.mock import patch +import pytest import vllm import vllm.config from vllm.lora.request import LoRARequest -from unittest.mock import patch from tests.e2e.conftest import VllmRunner from vllm_ascend.utils import enable_custom_op @@ -53,17 +53,12 @@ def do_sample( PROMPT_TEMPLATE.format(context="How many candidates are there?"), PROMPT_TEMPLATE.format(context="Count the number of candidates."), PROMPT_TEMPLATE.format( - context= - "Which poll resource provided the most number of candidate information?" # noqa: E501 + context="Which poll resource provided the most number of candidate information?" # noqa: E501 ), - PROMPT_TEMPLATE.format( - context= - "Return the poll resource associated with the most candidates."), + PROMPT_TEMPLATE.format(context="Return the poll resource associated with the most candidates."), ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=64, - stop=["<|im_end|>"]) + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64, stop=["<|im_end|>"]) if tensorizer_config_dict is not None: outputs = llm.generate( prompts, @@ -73,14 +68,15 @@ def do_sample( lora_id, lora_path, tensorizer_config_dict=tensorizer_config_dict, - ) if lora_id else None, + ) + if lora_id + else None, ) else: outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, ) generated_texts: list[str] = [] @@ -92,33 +88,40 @@ def do_sample( return generated_texts -def generate_and_test(llm, - llama32_lora_files, - tensorizer_config_dict: dict | None = None): +def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | None = None): print("lora adapter created") print("lora 1") - assert (do_sample( - llm, - llama32_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=1, - ) == EXPECTED_LORA_OUTPUT) + assert ( + do_sample( + llm, + llama32_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1, + ) + == EXPECTED_LORA_OUTPUT + ) print("lora 2") - assert (do_sample( - llm, - llama32_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=2, - ) == EXPECTED_LORA_OUTPUT) + assert ( + do_sample( + llm, + llama32_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=2, + ) + == EXPECTED_LORA_OUTPUT + ) print("base model") - assert (do_sample( - llm, - 
llama32_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=0, - ) == EXPECTED_BASE_MODEL_OUTPUT) + assert ( + do_sample( + llm, + llama32_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0, + ) + == EXPECTED_BASE_MODEL_OUTPUT + ) print("removing lora") diff --git a/tests/e2e/singlecard/test_models.py b/tests/e2e/singlecard/test_models.py index 659b5f69..fcbde3b6 100644 --- a/tests/e2e/singlecard/test_models.py +++ b/tests/e2e/singlecard/test_models.py @@ -45,9 +45,7 @@ def test_minicpm(model) -> None: ] max_tokens = 5 - with VllmRunner(model, - max_model_len=512, - gpu_memory_utilization=0.7) as runner: + with VllmRunner(model, max_model_len=512, gpu_memory_utilization=0.7) as runner: runner.generate_greedy(example_prompts, max_tokens) @@ -56,19 +54,12 @@ def test_whisper(model) -> None: prompts = ["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"] audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate] - sampling_params = SamplingParams(temperature=0.2, - max_tokens=10, - stop_token_ids=None) + sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None) - with VllmRunner(model, - max_model_len=448, - max_num_seqs=5, - dtype="bfloat16", - block_size=128, - gpu_memory_utilization=0.9) as runner: - outputs = runner.generate(prompts=prompts, - audios=audios, - sampling_params=sampling_params) + with VllmRunner( + model, max_model_len=448, max_num_seqs=5, dtype="bfloat16", block_size=128, gpu_memory_utilization=0.9 + ) as runner: + outputs = runner.generate(prompts=prompts, audios=audios, sampling_params=sampling_params) assert outputs is not None, "Generated outputs should not be None." assert len(outputs) > 0, "Generated outputs should not be empty." diff --git a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py index caf09bd9..f673b022 100644 --- a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py +++ b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py @@ -39,59 +39,56 @@ def test_models_with_multistream_overlap_shared_expert( max_tokens: int, ) -> None: prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=True, - cudagraph_capture_sizes=[4, 8, 16, 32], - additional_config={ - "multistream_overlap_shared_expert": True, - }, - quantization="ascend", + model, + max_model_len=1024, + enforce_eager=True, + cudagraph_capture_sizes=[4, 8, 16, 32], + additional_config={ + "multistream_overlap_shared_expert": True, + }, + quantization="ascend", ) as runner: - vllm_moe_ms_eager_outputs = runner.model.generate( - prompts, sampling_params) + vllm_moe_ms_eager_outputs = runner.model.generate(prompts, sampling_params) with VllmRunner( - model, - max_model_len=1024, - cudagraph_capture_sizes=[4, 8, 16, 32], - additional_config={ - "multistream_overlap_shared_expert": True, - }, - quantization="ascend", + model, + max_model_len=1024, + cudagraph_capture_sizes=[4, 8, 16, 32], + additional_config={ + "multistream_overlap_shared_expert": True, + }, + quantization="ascend", ) as runner: - vllm_moe_ms_aclgraph_outputs = runner.model.generate( - prompts, sampling_params) + 
vllm_moe_ms_aclgraph_outputs = runner.model.generate(prompts, sampling_params) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=True, - cudagraph_capture_sizes=[4, 8, 16, 32], - quantization="ascend", + model, + max_model_len=1024, + enforce_eager=True, + cudagraph_capture_sizes=[4, 8, 16, 32], + quantization="ascend", ) as runner: vllm_eager_outputs = runner.model.generate(prompts, sampling_params) vllm_moe_ms_eager_outputs_list = [] for output in vllm_moe_ms_eager_outputs: - vllm_moe_ms_eager_outputs_list.append( - (output.outputs[0].index, output.outputs[0].text)) + vllm_moe_ms_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text)) vllm_moe_ms_aclgraph_outputs_list = [] for output in vllm_moe_ms_aclgraph_outputs: - vllm_moe_ms_aclgraph_outputs_list.append( - (output.outputs[0].index, output.outputs[0].text)) + vllm_moe_ms_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text)) vllm_eager_outputs_list = [] for output in vllm_eager_outputs: - vllm_eager_outputs_list.append( - (output.outputs[0].index, output.outputs[0].text)) + vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text)) check_outputs_equal( outputs_0_lst=vllm_eager_outputs_list, diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py index 119be0c2..b50ac3cf 100644 --- a/tests/e2e/singlecard/test_quantization.py +++ b/tests/e2e/singlecard/test_quantization.py @@ -19,6 +19,7 @@ from tests.e2e.conftest import VllmRunner from tests.e2e.model_utils import check_outputs_equal +# fmt: off def test_qwen3_w8a8_quant(): max_tokens = 5 example_prompts = [ @@ -29,6 +30,7 @@ def test_qwen3_w8a8_quant(): 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be' )] +# fmt: on with VllmRunner( "vllm-ascend/Qwen3-0.6B-W8A8", @@ -47,7 +49,7 @@ def test_qwen3_w8a8_quant(): name_1="vllm_quant_w8a8_outputs", ) - +# fmt: off def test_qwen3_dense_w8a16(): max_tokens = 5 example_prompts = [ @@ -58,6 +60,7 @@ def test_qwen3_dense_w8a16(): 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be' )] +# fmt: on with VllmRunner( "vllm-ascend/Qwen3-0.6B-W8A16", diff --git a/tests/e2e/singlecard/test_qwen3_multi_loras.py b/tests/e2e/singlecard/test_qwen3_multi_loras.py index 60d61325..53beaf43 100644 --- a/tests/e2e/singlecard/test_qwen3_multi_loras.py +++ b/tests/e2e/singlecard/test_qwen3_multi_loras.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import patch + from vllm import SamplingParams from vllm.lora.request import LoRARequest -from unittest.mock import patch from tests.e2e.conftest import VllmRunner from vllm_ascend.utils import enable_custom_op @@ -27,16 +28,11 @@ LORA_TEST_EXPECTED = [ def format_chatml_messages(prompt: str): return [ - { - "role": "system", - "content": "You are a helpful assistant." 
- }, - { - "role": "user", - "content": prompt - }, + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, ] + @patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"}) def test_multi_loras_with_tp_sync(): lora_name_id_map = {} @@ -102,9 +98,7 @@ def test_multi_loras_with_tp_sync(): outputs = llm.chat( [messages], sampling_params, - chat_template_kwargs={ - "enable_thinking": False - }, # for those loras, ensure enable_thinking=False + chat_template_kwargs={"enable_thinking": False}, # for those loras, ensure enable_thinking=False lora_request=lora_request, use_tqdm=False, ) @@ -113,15 +107,13 @@ def test_multi_loras_with_tp_sync(): def reload_lora(name: str): """ - reload a lora to simulate the case: - setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` + reload a lora to simulate the case: + setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` for dynamic lora loading and unloading """ - remove_lora_response = llm.llm_engine.remove_lora( - lora_id=lora_name_id_map[name]) + remove_lora_response = llm.llm_engine.remove_lora(lora_id=lora_name_id_map[name]) - add_lora_response = llm.llm_engine.add_lora( - make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) + add_lora_response = llm.llm_engine.add_lora(make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) print(f"{remove_lora_response=}, {add_lora_response=}") @@ -131,7 +123,6 @@ def test_multi_loras_with_tp_sync(): assert outputs == expected for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED): - output_text = call_llm_get_outputs(prompt, "Alice") check_outputs(output_text, expected_output, prompt) diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py index 894977b8..9e64276a 100644 --- a/tests/e2e/singlecard/test_sampler.py +++ b/tests/e2e/singlecard/test_sampler.py @@ -25,15 +25,11 @@ def test_qwen3_topk() -> None: example_prompts = [ "Hello, my name is", ] - sampling_params = SamplingParams(max_tokens=5, - temperature=0.0, - top_k=50, - top_p=0.9) + sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9) - with VllmRunner("Qwen/Qwen3-0.6B", - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.7) as runner: + with VllmRunner( + "Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7 + ) as runner: runner.generate(example_prompts, sampling_params) @@ -42,29 +38,25 @@ def test_qwen3_prompt_logprobs() -> None: "Hello, my name is", ] - with VllmRunner("Qwen/Qwen3-0.6B", - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.7) as runner: - runner.generate_greedy_logprobs(example_prompts, - max_tokens=5, - num_logprobs=1) + with VllmRunner( + "Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7 + ) as runner: + runner.generate_greedy_logprobs(example_prompts, max_tokens=5, num_logprobs=1) def test_qwen3_exponential_overlap() -> None: example_prompts = [ "Hello, my name is", ] - sampling_params = SamplingParams(max_tokens=5, - temperature=1.0, - top_k=50, - top_p=0.9) + sampling_params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9) - with VllmRunner("Qwen/Qwen3-0.6B", - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.7, - additional_config={ - "enable_async_exponential": True, - }) as runner: + with VllmRunner( + "Qwen/Qwen3-0.6B", + max_model_len=8192, + cudagraph_capture_sizes=[1, 2, 4, 
8], + gpu_memory_utilization=0.7, + additional_config={ + "enable_async_exponential": True, + }, + ) as runner: runner.generate(example_prompts, sampling_params) diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index 33e896ff..e878f78a 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -20,6 +20,7 @@ Run `pytest tests/test_offline_inference.py`. """ + import os from unittest.mock import patch @@ -44,11 +45,13 @@ def test_multimodal_vl(vl_config): images = [image] * len(img_questions) prompts = vl_config["prompt_fn"](img_questions) - with VllmRunner(vl_config["model"], - mm_processor_kwargs=vl_config["mm_processor_kwargs"], - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - limit_mm_per_prompt={"image": 1}) as vllm_model: + with VllmRunner( + vl_config["model"], + mm_processor_kwargs=vl_config["mm_processor_kwargs"], + max_model_len=8192, + cudagraph_capture_sizes=[1, 2, 4, 8], + limit_mm_per_prompt={"image": 1}, + ) as vllm_model: outputs = vllm_model.generate_greedy( prompts=prompts, images=images, @@ -63,35 +66,30 @@ def test_multimodal_vl(vl_config): @patch.dict(os.environ, {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"}) def test_multimodal_audio(): - audio_prompt = "".join([ - f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" - for idx in range(2) - ]) + audio_prompt = "".join([f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(2)]) question = "What sport and what nursery rhyme are referenced?" - prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n" - f"{audio_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) mm_data = { - "audio": [ - asset.audio_and_sample_rate for asset in - [AudioAsset("mary_had_lamb"), - AudioAsset("winning_call")] - ] + "audio": [asset.audio_and_sample_rate for asset in [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]] } inputs = {"prompt": prompt, "multi_modal_data": mm_data} - sampling_params = SamplingParams(temperature=0.2, - max_tokens=10, - stop_token_ids=None) + sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None) - with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct", - max_model_len=4096, - max_num_seqs=5, - dtype="bfloat16", - limit_mm_per_prompt={"audio": 2}, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.9) as runner: + with VllmRunner( + "Qwen/Qwen2-Audio-7B-Instruct", + max_model_len=4096, + max_num_seqs=5, + dtype="bfloat16", + limit_mm_per_prompt={"audio": 2}, + cudagraph_capture_sizes=[1, 2, 4, 8], + gpu_memory_utilization=0.9, + ) as runner: outputs = runner.generate(inputs, sampling_params=sampling_params) assert outputs is not None, "Generated outputs should not be None." diff --git a/tests/e2e/singlecard/test_xlite.py b/tests/e2e/singlecard/test_xlite.py index 8de3972b..231cb408 100644 --- a/tests/e2e/singlecard/test_xlite.py +++ b/tests/e2e/singlecard/test_xlite.py @@ -20,13 +20,14 @@ Compare the outputs of vLLM with and without xlite. Run `pytest tests/e2e/singlecard/test_xlite.py`. 
""" +# ruff: noqa: E501 + import os import pytest from vllm import SamplingParams -from tests.e2e.singlecard.utils import (PROMPTS_SHORT, LLMTestCase, - gen_and_valid) +from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2" @@ -35,9 +36,9 @@ CASE_DECODE_ONLY = LLMTestCase( prompts=PROMPTS_SHORT, golden_answers=[ "Hello, my name is Lina. I'm a 22-year-old student from China.", - 'The president of the United States is the same as the president of the United Nations. This is because the president', - 'The capital of France is Paris. The capital of France is also the capital of the French Republic.', - 'The future of AI is not just a technological challenge but a profound transformation of how we live, work' + "The president of the United States is the same as the president of the United Nations. This is because the president", + "The capital of France is Paris. The capital of France is also the capital of the French Republic.", + "The future of AI is not just a technological challenge but a profound transformation of how we live, work", ], sampling_params=SamplingParams( max_tokens=15, @@ -45,19 +46,22 @@ CASE_DECODE_ONLY = LLMTestCase( top_p=1.0, top_k=0, n=1, - )) + ), +) CASE_FULL = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=[ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ], golden_answers=[ " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', - ' Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital', - " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and" + " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", + " Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital", + " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. 
As we stand at the intersection of artificial intelligence and", ], sampling_params=SamplingParams( max_tokens=32, @@ -65,27 +69,25 @@ CASE_FULL = LLMTestCase( top_p=1.0, top_k=0, n=1, - )) + ), +) -@pytest.mark.skip( - reason="TODO: Re-enable xlite_decode_only e2e test when stable.") +@pytest.mark.skip(reason="TODO: Re-enable xlite_decode_only e2e test when stable.") @pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY]) def test_models_with_xlite_decode_only(cur_case: LLMTestCase): runner_kwargs = { "model_name": cur_case.model, "max_model_len": 1024, "block_size": 128, - "additional_config": { - "xlite_graph_config": { - "enabled": True - } - }, + "additional_config": {"xlite_graph_config": {"enabled": True}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) @pytest.mark.parametrize("cur_case", [CASE_FULL]) @@ -94,14 +96,11 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase): "model_name": cur_case.model, "max_model_len": 1024, "block_size": 128, - "additional_config": { - "xlite_graph_config": { - "enabled": True, - "full_mode": True - } - }, + "additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) diff --git a/tests/e2e/singlecard/utils.py b/tests/e2e/singlecard/utils.py index b9ada6c8..1ac30acb 100644 --- a/tests/e2e/singlecard/utils.py +++ b/tests/e2e/singlecard/utils.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from vllm import SamplingParams @@ -7,37 +6,44 @@ from tests.e2e.conftest import VllmRunner from tests.e2e.model_utils import check_outputs_equal PROMPTS_SHORT = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] # NOTE: Randomly fill the prompt with the requested amount for # the specified capture shape to prevent accuracy issues caused by padding PROMPTS_LONG = [ - ('Solve the following math problem step by step.' - 'The last line of your response should be of the form Answer: ' - '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' - 'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$' - 'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,' - '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.' - 'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,' - 'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.' - ), - ('Solve the following math problem step by step.' - 'The last line of your response should be of the form Answer: ' - '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' - 'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen' - 'independently and uniformly at random on the perimeter of $ABCD$.' 
- 'If the expected value of the area of triangle $\\triangle AXY$' - 'can be expressed as $\\frac{m}{n}$, for relatively prime positive' - 'integers $m$ and $n$, compute $m+n$.'), - ('Solve the following math problem step by step.' - 'The last line of your response should be of the form Answer: ' - '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' - 'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$' - 'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$' - 'and $x^2 + cx + b = 0$ also have a common real root.' - 'Compute the sum $a + b + c$.') + ( + "Solve the following math problem step by step." + "The last line of your response should be of the form Answer: " + "$Answer (without quotes) where $Answer is the answer to the problem.\n\n" + "In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$" + "be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$," + "$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$." + "If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$," + "where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$." + ), + ( + "Solve the following math problem step by step." + "The last line of your response should be of the form Answer: " + "$Answer (without quotes) where $Answer is the answer to the problem.\n\n" + "Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen" + "independently and uniformly at random on the perimeter of $ABCD$." + "If the expected value of the area of triangle $\\triangle AXY$" + "can be expressed as $\\frac{m}{n}$, for relatively prime positive" + "integers $m$ and $n$, compute $m+n$." + ), + ( + "Solve the following math problem step by step." + "The last line of your response should be of the form Answer: " + "$Answer (without quotes) where $Answer is the answer to the problem.\n\n" + "Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$" + "and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$" + "and $x^2 + cx + b = 0$ also have a common real root." + "Compute the sum $a + b + c$." + ), ] @@ -46,7 +52,7 @@ class LLMTestCase: model: str prompts: list[str] golden_answers: list[str] - quantization: Optional[str] = None + quantization: str | None = None sampling_params: SamplingParams = field( default_factory=lambda: SamplingParams( max_tokens=32, @@ -54,14 +60,13 @@ class LLMTestCase: top_p=1.0, top_k=0, n=1, - )) + ) + ) -def gen_and_valid(runner_kwargs: dict, prompts: list[str], - sampling_params: SamplingParams, golden_answers: list[str]): +def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]): with VllmRunner(**runner_kwargs) as runner: - vllm_aclgraph_outputs = runner.model.generate( - prompts=prompts, sampling_params=sampling_params) + vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params) outputs_gen = [] for output in vllm_aclgraph_outputs: outputs_gen.append(([output.outputs[0].index], output.outputs[0].text))