Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/compile/fullgraph/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import dataclasses
+
+import pytest
+
+from vllm.config import CompilationMode
+from vllm.utils.torch_utils import cuda_device_count_stateless
+
+from ...utils import compare_all_settings
+
+
+@dataclasses.dataclass
+class TestSetting:
+    model: str
+    model_args: list[str]
+    pp_size: int
+    tp_size: int
+    attn_backend: str
+    method: str
+
+
+# we cannot afford testing the full Cartesian product
+# of all models and all modes
+@pytest.mark.parametrize(
+    "test_setting",
+    [
+        # basic llama model
+        TestSetting(
+            model="meta-llama/Llama-3.2-1B-Instruct",
+            model_args=["--max-model-len", "2048"],
+            pp_size=2,
+            tp_size=2,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+        ),
+        # llama model with quantization
+        TestSetting(
+            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            model_args=["--quantization", "gptq", "--max-model-len", "2048"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+        ),
+        # MoE model
+        TestSetting(
+            model="ibm/PowerMoE-3b",
+            model_args=["--max-model-len", "2048"],
+            pp_size=1,
+            tp_size=2,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+        ),
+        # embedding model
+        TestSetting(
+            model="BAAI/bge-multilingual-gemma2",
+            model_args=[
+                "--runner",
+                "pooling",
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
+            ],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+        ),
+        TestSetting(
+            model="BAAI/bge-base-en-v1.5",
+            model_args=["--runner", "pooling"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+        ),
+        # vision language model
+        # See https://github.com/vllm-project/vllm/issues/26716.
+        # TestSetting(
+        #     model="microsoft/Phi-3.5-vision-instruct",
+        #     model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        #     pp_size=2,
+        #     tp_size=1,
+        #     attn_backend="FLASH_ATTN",
+        #     method="generate_with_image",
+        # ),
+    ],
+)
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
+    # this test is run under multiple suits, with different GPUs.
+    # make sure we only run the test with correct CUDA devices.
+    # don't use "<", as it will duplicate the tests.
+    model = test_setting.model
+    model_args = test_setting.model_args
+    pp_size = test_setting.pp_size
+    tp_size = test_setting.tp_size
+    attn_backend = test_setting.attn_backend
+    method = test_setting.method
+    if cuda_device_count_stateless() < pp_size * tp_size:
+        pytest.skip(
+            f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
+            f"{cuda_device_count_stateless()}"
+        )
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            *model_args,
+            "-pp",
+            str(pp_size),
+            "-tp",
+            str(tp_size),
+            "-cc.cudagraph_mode=none",
+        ]
+
+        all_args: list[list[str]] = []
+        all_envs: list[dict[str, str] | None] = []
+
+        for comp_mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+            CompilationMode.VLLM_COMPILE,
+        ]:
+            for mode in [CompilationMode.NONE, comp_mode]:
+                all_args.append(
+                    final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
+                )
+
+            # inductor will change the output, so we only compare if the output
+            # is close, not exactly the same.
+            compare_all_settings(
+                model,
+                all_args,
+                all_envs,
+                method=method if method != "generate" else "generate_close",
+            )
+            all_envs.clear()
+            all_args.clear()
+
+        for mode in [
+            CompilationMode.NONE,
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+            CompilationMode.VLLM_COMPILE,
+        ]:
+            all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+            all_envs.append({})
+            all_envs.append({})
+
+        compare_all_settings(model, all_args * 3, all_envs, method=method)