Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,155 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
import pytest
from vllm.config import CompilationMode
from vllm.utils.torch_utils import cuda_device_count_stateless
from ...utils import compare_all_settings
@dataclasses.dataclass
class TestSetting:
model: str
model_args: list[str]
pp_size: int
tp_size: int
attn_backend: str
method: str
# we cannot afford testing the full Cartesian product
# of all models and all modes
@pytest.mark.parametrize(
"test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=["--max-model-len", "2048"],
pp_size=2,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq", "--max-model-len", "2048"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=["--max-model-len", "2048"],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=[
"--runner",
"pooling",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
),
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--runner", "pooling"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
),
# vision language model
# See https://github.com/vllm-project/vllm/issues/26716.
# TestSetting(
# model="microsoft/Phi-3.5-vision-instruct",
# model_args=["--trust-remote-code", "--max-model-len", "2048"],
# pp_size=2,
# tp_size=1,
# attn_backend="FLASH_ATTN",
# method="generate_with_image",
# ),
],
)
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple test suites, with different GPUs.
# make sure we only run the test when enough CUDA devices are available.
model = test_setting.model
model_args = test_setting.model_args
pp_size = test_setting.pp_size
tp_size = test_setting.tp_size
attn_backend = test_setting.attn_backend
method = test_setting.method
if cuda_device_count_stateless() < pp_size * tp_size:
pytest.skip(
f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
f"{cuda_device_count_stateless()}"
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
*model_args,
"-pp",
str(pp_size),
"-tp",
str(tp_size),
"-cc.cudagraph_mode=none",
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
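# For each compile mode, pair an uncompiled (NONE) baseline with the
# compiled variant using otherwise identical arguments, then compare
# their outputs.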
for comp_mode in [
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
all_envs.append({})
# inductor may change the numerics, so we only check that the outputs
# are close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
)
all_envs.clear()
all_args.clear()
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
compare_all_settings(model, all_args * 3, all_envs * 3, method=method)

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
import weakref
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
@contextlib.contextmanager
def temporary_environ(env_vars):
"""
Temporarily set environment variables and restore them afterward.
We have to do this vs monkeypatch because monkeypatch doesn't work
with "module" scoped fixtures.
"""
original_env = {k: os.environ.get(k) for k in env_vars}
try:
os.environ.update(env_vars)
yield
finally:
for k, v in original_env.items():
if v is None:
os.environ.pop(k, None)
else:
os.environ[k] = v
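# Illustrative usage (not part of the tests): any variable set inside the
# block is restored to its previous value, or removed if it was unset.
#
#     with temporary_environ({"VLLM_ATTENTION_BACKEND": "FLASH_ATTN"}):
#         ...  # code that reads os.environ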
model_backends_full_cudagraph = []
# deepseek-ai/DeepSeek-V2-Lite with MLA
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
for mla_backend in MLA_backends:
model_backends_full_cudagraph.append(
("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
)
# Qwen/Qwen2-1.5B-Instruct with other backends
other_backend_configs = [
backend_configs[c] for c in backend_configs if c not in MLA_backends
]
for backend_config in other_backend_configs:
model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))
@pytest.fixture(scope="class")
def llm_pair(request):
model, backend_config, use_inductor_graph_partition = request.param
backend_config.comp_config["use_inductor_graph_partition"] = (
use_inductor_graph_partition
)
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition only supported in torch>=2.9")
# Dynamically skip test if GPU capability is not met
if (
backend_config.specific_gpu_arch
and backend_config.specific_gpu_arch != current_platform.get_device_capability()
):
if backend_config.specific_gpu_arch == (9, 0):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
elif backend_config.specific_gpu_arch == (10, 0):
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
env_vars = {
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0",
**backend_config.env_vars,
}
with temporary_environ(env_vars):
full = LLM(
model=model,
gpu_memory_utilization=0.43,
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=128,
compilation_config=CompilationConfig(**backend_config.comp_config),
generation_config="vllm",
seed=42,
)
piecewise = LLM(
model=model,
gpu_memory_utilization=0.43,
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=128,
compilation_config=CompilationConfig(cudagraph_mode="PIECEWISE"),
generation_config="vllm",
seed=42,
)
# PyTest caches the fixture values so we use weakref.proxy to enable GC
yield weakref.proxy(full), weakref.proxy(piecewise)
del full
del piecewise
wait_for_gpu_memory_to_clear(
devices=[0],
threshold_ratio=0.1,
)
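# Each parametrization below builds one (full cudagraph, piecewise) LLM pair
# that is shared by all batch_size/max_tokens cases in the class.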
@pytest.mark.parametrize(
"llm_pair",
[
pytest.param((model, backend_config, use_inductor_graph_partition))
for model, backend_config in model_backends_full_cudagraph
for use_inductor_graph_partition in [True, False]
],
indirect=True,
)
class TestFullCUDAGraph:
"""
Use a class such that an llm pair is constructed once for all
batch_size/max_tokens combinations and released immediately after.
Module-scope fixtures would stick around the whole time,
meaning there would be multiple LLM instances hogging memory simultaneously.
"""
@pytest.mark.parametrize(
("batch_size", "max_tokens"),
[
(1, 10),
(7, 10),
(16, 10),
(25, 10),
(32, 10),
(45, 10),
(64, 10),
(123, 10),
(8, 5),
(8, 30),
],
)
def test_full_cudagraph(self, batch_size, max_tokens, llm_pair: tuple[LLM, LLM]):
"""
Test various batch sizes and max_tokens to ensure that the
full cudagraph compilation works for padded cases too.
"""
full_cudagraph_llm, piecewise_llm = llm_pair
prompts = ["the quick brown fox"] * batch_size
# Use purely greedy decoding to avoid top-p truncation sensitivity
# that can amplify tiny numeric differences across runtimes.
sampling_params = SamplingParams(
temperature=0.0, max_tokens=max_tokens, top_p=1.0
)
piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
full_responses = full_cudagraph_llm.generate(prompts, sampling_params)
# Check that all responses are the same
for piecewise_res, full_res in zip(piecewise_responses, full_responses):
assert (
piecewise_res.outputs[0].text.lower()
== full_res.outputs[0].text.lower()
)
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_full_cudagraph_with_invalid_backend():
with (
temporary_environ(
{
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}
),
pytest.raises(RuntimeError),
):
LLM(
model="Qwen/Qwen2-1.5B-Instruct",
compilation_config=CompilationConfig(cudagraph_mode="FULL"),
)

View File

@@ -0,0 +1,250 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from pathlib import Path
from typing import Any
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
def models_list(*, all: bool = True, keywords: list[str] | None = None):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
(
"neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
{"dtype": torch.float16},
),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if all:
TEST_MODELS.extend(
[
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
{"dtype": torch.float16},
),
]
)
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"})
)
if is_quant_method_supported("gptq"):
TEST_MODELS.append(
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"})
)
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
{"quantization": "gptq_marlin"},
)
)
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(
(
"alexm-nm/tinyllama-24-marlin24-4bit-g128",
{"quantization": "gptq_marlin_24"},
)
)
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"})
)
if keywords is None:
return TEST_MODELS
# filter by keywords
pred = lambda model: any(keyword in model[0] for keyword in keywords)
return list(filter(pred, TEST_MODELS))
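# Illustrative example (not part of the tests): models_list(keywords=["FP8"])
# would return only the FP8-dynamic entry from the list above.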
@pytest.mark.parametrize(
"compilation_mode",
[CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
)
@pytest.mark.parametrize("model, model_kwargs", models_list(all=True))
@create_new_process_for_each_test()
def test_full_graph(
monkeypatch: pytest.MonkeyPatch,
model: str,
model_kwargs: dict[str, Any],
compilation_mode: int,
):
if (
"w8a8" in model or "w8w8" in model
) and current_platform.has_device_capability((10, 0)):
# int8 support was removed on Blackwell
pytest.skip("int8 support removed on Blackwell")
with monkeypatch.context():
print(f"MODEL={model}")
run_model(compilation_mode, model, **model_kwargs)
# TODO(luka) add other supported compilation config scenarios here
@pytest.mark.parametrize(
"compilation_config, model, model_kwargs",
[
# additional compile sizes, only some of the models
(
CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
*model_info,
)
for model_info in models_list(all=False)
]
+ [
# RMSNorm + quant fusion, only 8-bit quant models
(
CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm"],
pass_config=PassConfig(
fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
),
),
*model_info,
)
for model_info in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
]
+ [
# Test depyf integration works
(
CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
debug_dump_path=Path(tempfile.gettempdir()),
),
"facebook/opt-125m",
{},
),
]
+ [
# graph inductor partition
(
CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
# inductor graph partition uses
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
use_inductor_graph_partition=True,
cudagraph_mode=CUDAGraphMode.PIECEWISE,
compile_sizes=[1, 2],
),
*model_info,
)
for model_info in models_list(all=False)
if is_torch_equal_or_newer("2.9.0.dev")
],
)
# only test some of the models
@create_new_process_for_each_test()
def test_custom_compile_config(
compilation_config: CompilationConfig,
model: str,
model_kwargs: dict[str, Any],
):
if (
"w8a8" in model or "w8w8" in model
) and current_platform.has_device_capability((10, 0)):
# int8 support was removed on Blackwell
pytest.skip("int8 support removed on Blackwell")
if compilation_config.use_inductor_graph_partition and not is_torch_equal_or_newer(
"2.9.0.dev"
):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
print(f"MODEL={model}")
run_model(compilation_config, model, **model_kwargs)
@pytest.mark.parametrize(
"compilation_mode",
[CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
)
@pytest.mark.parametrize(
"model, backend",
[
("Qwen/Qwen2-0.5B", None), # Standard attention model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.FLASHINFER_MLA,
), # MLA (Multi-head Latent Attention) model
],
)
def test_fp8_kv_scale_compile(
monkeypatch: pytest.MonkeyPatch,
compilation_mode: int,
model: str,
backend: AttentionBackendEnum | None,
):
if backend:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3",
"calculate_kv_scales": True,
"max_model_len": 512,
}
run_model(compilation_mode, model, **model_kwargs)
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
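# Accept either a bare CompilationMode (int) or a full CompilationConfig
# and normalize it into a CompilationConfig before constructing the LLM.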
compilation_config = (
compile_config
if isinstance(compile_config, CompilationConfig)
else CompilationConfig(mode=compile_config)
)
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
# Allow override from model_kwargs
model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
# No cudagraphs by default
if compilation_config.cudagraph_mode is None:
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
llm = LLM(
model=model,
compilation_config=compilation_config,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig
from vllm.config.compilation import CompilationMode
from vllm.platforms import current_platform
def test_compile():
vllm_config = VllmConfig()
# Default configuration does not compile mm encoder
assert not vllm_config.compilation_config.compile_mm_encoder
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
"""Test that Qwen2.5-VL vision submodules are compiled.
This test verifies that the 3 vision submodules (Qwen2_5_VisionPatchEmbed,
Qwen2_5_VisionBlock, and Qwen2_5_VisionPatchMerger) are properly tagged
for compilation by checking that num_models_seen increases by at least 3.
"""
# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with (
# NOTE: Qwen2.5-VL has 35 models in total - the LLM backend
# Vision Patch Embed, Vision Patch Merger, and then 32 Vision Blocks
# (one for each layer) - in the future, we should fix vLLM compilation
# logic to handle this case and only compile the Vision submodules once
# and reuse the compiled code for all layers
# See https://github.com/vllm-project/vllm/issues/27590
compilation_counter.expect(num_models_seen=35),
vllm_runner(
"Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=2048,
gpu_memory_utilization=0.8,
compilation_config={
"mode": CompilationMode.VLLM_COMPILE,
"compile_mm_encoder": True,
},
) as _,
):
pass
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
"""Test that Qwen2.5-VL vision submodules are not compiled when the
config is passed off
"""
# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with (
compilation_counter.expect(num_models_seen=1),
vllm_runner(
"Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=2048,
gpu_memory_utilization=0.8,
compilation_config={
"mode": CompilationMode.VLLM_COMPILE,
"compile_mm_encoder": False,
},
) as _,
):
pass

View File

@@ -0,0 +1,326 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test (piecewise) compilation with a simple model where multiple submodules
are compiled and graph captured separately.
"""
import pytest
import torch
from torch import nn
from vllm.compilation.backends import set_model_tag
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
from vllm.config import (
CompilationConfig,
CompilationMode,
CUDAGraphMode,
VllmConfig,
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
BATCH_SIZE = 32
MLP_SIZE = 128
HIDDEN_SIZE = 1024
RANDOM_SEED = 0
@support_torch_compile
class ParentModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
super().__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x
class Attention(nn.Module):
def __init__(self, mlp_size: int, hidden_size: int) -> None:
super().__init__()
self.pre_attn = nn.Linear(mlp_size, hidden_size, bias=False)
self.post_attn = nn.Linear(hidden_size, mlp_size, bias=False)
self.rms_norm_weight = nn.Parameter(torch.ones(hidden_size))
# Initialize to same weights for testing
nn.init.xavier_normal_(
self.pre_attn.weight.data,
generator=torch.Generator().manual_seed(RANDOM_SEED),
gain=0.001,
)
nn.init.xavier_normal_(
self.post_attn.weight.data,
generator=torch.Generator().manual_seed(RANDOM_SEED),
gain=0.001,
)
def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor:
x_f32 = x.float()
return (
x_f32
* torch.rsqrt(torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6)
* self.rms_norm_weight
).to(x.dtype)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.pre_attn(x)
x = self.rms_norm_ref(x)
attn_output = torch.empty_like(x)
torch.ops.silly.attention(x, x, x, attn_output)
x = attn_output
x = self.rms_norm_ref(x)
x = self.post_attn(x)
return x
@support_torch_compile
class CompiledAttention(nn.Module):
def __init__(
self,
*,
mlp_size: int,
hidden_size: int,
vllm_config: VllmConfig,
prefix: str = "",
**kwargs,
) -> None:
super().__init__()
self.attn = Attention(mlp_size, hidden_size)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.attn(x)
@support_torch_compile
class CompiledAttentionTwo(CompiledAttention):
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.attn(x) + x
@ignore_torch_compile
class SimpleModelWithTwoGraphs(ParentModel):
def __init__(
self,
*,
mlp_size: int,
hidden_size: int,
vllm_config: VllmConfig,
prefix: str = "",
**kwargs,
) -> None:
super().__init__(vllm_config=vllm_config, prefix=prefix)
# Test will fail without set_model_tag here with error:
# "ValueError: too many values to unpack (expected 3)"
# This is because CompiledAttention and CompiledAttentionTwo
# have different implementations, but without distinct tags they
# would share the same torch.compile cache dir (the default prefix
# is 'model_tag').
with set_model_tag("attn_one"):
self.attn_one = CompiledAttention(
mlp_size=mlp_size,
hidden_size=hidden_size,
vllm_config=vllm_config,
prefix=f"{prefix}.attn_one",
)
with set_model_tag("attn_two"):
self.attn_two = CompiledAttentionTwo(
mlp_size=mlp_size,
hidden_size=hidden_size,
vllm_config=vllm_config,
prefix=f"{prefix}.attn_two",
)
self.hidden_states = torch.zeros((BATCH_SIZE, MLP_SIZE)).cuda()
def forward(self, x: torch.Tensor) -> torch.Tensor:
bsz = x.shape[0]
# CUDAGraph expects same tensor addresses for each run
self.hidden_states[:bsz].copy_(x)
x = self.attn_one(self.hidden_states[:bsz])
self.hidden_states[:bsz].copy_(x)
x = self.attn_two(self.hidden_states[:bsz])
return x
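# run_model drives the model the way the engine would: an eager warmup pass,
# capture-shaped passes for num_tokens=2 and num_tokens=1, and finally a
# replay-shaped pass whose output is returned for comparison.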
@torch.inference_mode
def run_model(
vllm_config: VllmConfig,
model: nn.Module,
inputs: torch.Tensor,
cudagraph_runtime_mode: CUDAGraphMode,
):
with set_forward_context({}, vllm_config=vllm_config):
# warmup for the model with cudagraph_mode NONE
model(inputs)
# simulate cudagraphs capturing
with set_forward_context(
{},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=2,
),
):
model(inputs[:2])
with set_forward_context(
{},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=1,
),
):
model(inputs[:1])
# simulate cudagraphs replay
with set_forward_context(
{},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=2,
),
):
output = model(inputs[:2])
return output.cpu()
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@create_new_process_for_each_test("spawn")
def test_multi_graph_piecewise_compile(
use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch
):
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
outputs = []
# VLLM_COMPILE with piecewise cudagraphs
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
cudagraph_mode=CUDAGraphMode.PIECEWISE,
splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2],
use_inductor_graph_partition=use_inductor_graph_partition,
)
)
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
with set_current_vllm_config(vllm_config):
model = (
SimpleModelWithTwoGraphs(
mlp_size=MLP_SIZE,
hidden_size=HIDDEN_SIZE,
vllm_config=vllm_config,
prefix="",
)
.eval()
.cuda()
)
# Pre-allocate memory for CUDAGraph which expects
# static tensor addresses
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
if use_inductor_graph_partition:
# Splitting happens at the Inductor lowering level, so the number of
# piecewise fx graphs equals the total number of graphs
num_piecewise_fx = 2
num_piecewise_capturable_fx = 2
else:
# attn_one and attn_two each have 3 piecewise graphs
# (pre-attn, silly_attention, post-attn)
num_piecewise_fx = 6
# attn_one and attn_two each have a capturable pre-attn and post-attn
# graph, for a total of 4
num_piecewise_capturable_fx = 4
with compilation_counter.expect(
num_graphs_seen=2, # two graphs for the model
num_piecewise_graphs_seen=num_piecewise_fx,
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
num_backend_compilations=num_piecewise_capturable_fx,
num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions
):
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# no compile or cudagraph
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.NONE,
)
)
cudagraph_runtime_mode = CUDAGraphMode.NONE
with set_current_vllm_config(vllm_config):
model = (
SimpleModelWithTwoGraphs(
mlp_size=MLP_SIZE,
hidden_size=HIDDEN_SIZE,
vllm_config=vllm_config,
prefix="",
)
.eval()
.cuda()
)
with compilation_counter.expect(
num_graphs_seen=0,
num_piecewise_graphs_seen=0,
num_piecewise_capturable_graphs_seen=0,
num_backend_compilations=0,
num_cudagraph_captured=0,
):
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# piecewise compile without CUDA graph
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
cudagraph_mode=CUDAGraphMode.NONE,
splitting_ops=["silly::attention"],
use_inductor_graph_partition=use_inductor_graph_partition,
)
)
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
with set_current_vllm_config(vllm_config):
model = (
SimpleModelWithTwoGraphs(
mlp_size=MLP_SIZE,
hidden_size=HIDDEN_SIZE,
vllm_config=vllm_config,
prefix="",
)
.eval()
.cuda()
)
with compilation_counter.expect(
num_graphs_seen=2,
num_piecewise_graphs_seen=num_piecewise_fx,
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
num_backend_compilations=num_piecewise_capturable_fx,
num_cudagraph_captured=0, # no cudagraph captured
):
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# Generally don't expect outputs with and without inductor
# to be bitwise equivalent
assert torch.allclose(outputs[0], outputs[1])
# Expect bitwise equivalence using inductor w/ and w/o cudagraph
assert torch.equal(outputs[0], outputs[2])

View File

@@ -0,0 +1,167 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""
import pytest
import torch
from torch import nn
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (
CompilationConfig,
CompilationMode,
CUDAGraphMode,
VllmConfig,
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from ..silly_attention import get_global_counter, reset_global_counter
@support_torch_compile
class SillyModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
super().__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Overall effect:
x = 3 * x + 19
global_counter += 2
"""
x = x + 1
x = x + 2
out = torch.empty_like(x)
torch.ops.silly.attention(x, x, x, out)
x = out
x = x - 2
x = x - 1
out = torch.empty_like(x)
torch.ops.silly.attention(x, x, x, out)
x = out
x = x + 1
return x
def _run_simple_model(
splitting_ops,
use_inductor_graph_partition,
backend,
expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations,
expected_num_cudagraph_captured,
):
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
backend=backend,
splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True,
cudagraph_capture_sizes=[1, 2],
)
)
with set_current_vllm_config(vllm_config):
model = SillyModel(vllm_config=vllm_config, prefix="")
inputs = torch.randn(100).cuda()
with (
compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
num_backend_compilations=expected_num_backend_compilations,
num_cudagraph_captured=expected_num_cudagraph_captured,
),
set_forward_context(None, vllm_config=vllm_config),
): # background context
# warm up with background context
model(inputs)
# capturing/replaying should run under the cudagraph dispatching context
with set_forward_context(
None,
vllm_config=vllm_config,
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
batch_descriptor=BatchDescriptor(
num_tokens=2,
),
):
model(torch.randn(2).cuda())
with set_forward_context(
None,
vllm_config=vllm_config,
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
batch_descriptor=BatchDescriptor(
num_tokens=1,
),
):
model(torch.randn(1).cuda())
input = torch.zeros(2).cuda()
reset_global_counter()
with set_forward_context(
None,
vllm_config=vllm_config,
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
batch_descriptor=BatchDescriptor(
num_tokens=2,
),
):
output = model(input)
assert get_global_counter() == 2
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode()
@create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(backend):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=False,
backend=backend,
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
expected_num_piecewise_capturable_graphs_seen=3,
# num_piecewise_capturable_graphs_seen
expected_num_backend_compilations=3,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6,
)
@torch.inference_mode()
def test_simple_inductor_graph_partition(monkeypatch):
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
# disable compile cache so that we run separately for different splitting_ops
# and get the expected number of cudagraphs captured.
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=True,
backend="inductor",
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_backend_compilations=1,
# Inductor graph partition still captures 6 graphs, same as fx graph partition
expected_num_cudagraph_captured=6,
)

View File

@@ -0,0 +1,523 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test the piecewise compilation with a simple model, comparing the output
with and without the piecewise compilation.
This is a tractable model: when the config `tractable_init` is set to True,
the weights and computation are specially designed so the expected output can
be computed exactly. Otherwise, the weights are initialized randomly with a
fixed seed.
"""
from copy import deepcopy
from dataclasses import dataclass
from typing import Any
import pytest
import torch
from torch import nn
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (
CompilationConfig,
CompilationMode,
CUDAGraphMode,
VllmConfig,
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
@dataclass
class LlamaConfig:
hidden_size: int = 128
mlp_size: int = 256
vocab_size: int = 128
num_layers: int = 2
init_value: float = 1.0
tractable_init: bool = False
random_seed: int = 0
def compute_hash(self) -> str:
factors: list[Any] = []
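# Exclude random_seed: it only affects weight values, not graph structure.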
for k, v in self.__dict__.items():
if k == "random_seed":
continue
factors.append((k, v))
factors.sort()
import hashlib
return hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
def __post_init__(self):
assert self.mlp_size >= self.hidden_size
class LlamaMLP(nn.Module):
def __init__(self, config: LlamaConfig) -> None:
super().__init__()
self.gate_up_projection = nn.Linear(
in_features=config.hidden_size,
out_features=config.mlp_size * 2,
bias=False,
)
self.down_projection = nn.Linear(
in_features=config.mlp_size,
out_features=config.hidden_size,
bias=False,
)
if config.tractable_init:
nn.init.eye_(self.gate_up_projection.weight.data[: config.mlp_size])
nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size :])
nn.init.eye_(self.down_projection.weight.data)
else:
nn.init.xavier_normal_(
self.gate_up_projection.weight.data,
generator=torch.Generator().manual_seed(config.random_seed),
gain=0.001,
)
nn.init.xavier_normal_(
self.down_projection.weight.data,
generator=torch.Generator().manual_seed(config.random_seed),
gain=0.001,
)
def forward(self, x):
# for tractable_init and positive input, this is
# essentially an elementwise-square
x = self.gate_up_projection(x)
x = x[:, : x.size(1) // 2] * torch.nn.functional.relu(x[:, x.size(1) // 2 :])
x = self.down_projection(x)
return x
class LlamaAttention(nn.Module):
def __init__(self, config: LlamaConfig) -> None:
super().__init__()
self.qkv_projection = nn.Linear(
in_features=config.hidden_size,
out_features=config.hidden_size * 3,
bias=False,
)
self.output_projection = nn.Linear(
in_features=config.hidden_size,
out_features=config.hidden_size,
bias=False,
)
if config.tractable_init:
nn.init.eye_(self.qkv_projection.weight.data[: config.hidden_size])
nn.init.eye_(
self.qkv_projection.weight.data[
config.hidden_size : 2 * config.hidden_size
]
)
nn.init.eye_(self.qkv_projection.weight.data[2 * config.hidden_size :])
nn.init.eye_(self.output_projection.weight.data)
else:
nn.init.xavier_normal_(
self.qkv_projection.weight.data,
generator=torch.Generator().manual_seed(config.random_seed),
gain=0.001,
)
nn.init.xavier_normal_(
self.output_projection.weight.data,
generator=torch.Generator().manual_seed(config.random_seed),
gain=0.001,
)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> torch.Tensor:
# for tractable_init, this is:
# output = (hidden_states * 3 + positions * 2)
qkv = self.qkv_projection(hidden_states)
hidden_size = qkv.size(-1) // 3
q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1)
q = q + positions.unsqueeze(1)
k = k + positions.unsqueeze(1)
attn_output = torch.empty_like(q)
torch.ops.silly.attention(q, k, v, attn_output)
output = self.output_projection(attn_output)
return output
class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig) -> None:
super().__init__()
self.self_attention = LlamaAttention(config)
self.mlp = LlamaMLP(config)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
residual: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
For tractable computation:
- if residual is None, the outputs are:
- residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3
- hidden_states = (residual + 1) ** 2
- if residual is not None, the outputs are:
- residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3
- hidden_states = (residual + 1) ** 2
""" # noqa
if residual is None:
residual = hidden_states
hidden_states = hidden_states + 1
else:
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = hidden_states + 1
hidden_states = self.self_attention(
positions=positions, hidden_states=hidden_states
)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = hidden_states + 1
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
@support_torch_compile
class LlamaModel(nn.Module):
def __init__(
self,
*,
vllm_config: VllmConfig,
config: LlamaConfig,
prefix: str = "",
**kwargs,
) -> None:
super().__init__()
self.embedding_tokens = nn.Embedding(
num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
)
self.layers = nn.ModuleList(
[LlamaDecoderLayer(config) for _ in range(config.num_layers)]
)
# this is the initial value of the hidden states
self.embedding_tokens.weight.data.fill_(config.init_value)
def forward(
self,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
) -> torch.Tensor:
hidden_states = self.embedding_tokens(input_ids)
residual = None
for layer in self.layers:
hidden_states, residual = layer(positions, hidden_states, residual)
return hidden_states
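# Closed-form reference for the model above when tractable_init=True,
# mirroring the per-layer formulas in LlamaDecoderLayer.forward's docstring.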
def tractable_computation(
input_ids: torch.Tensor,
positions: torch.Tensor,
config: LlamaConfig,
init_value: float = 1.0,
) -> torch.Tensor:
hidden_states = (
torch.ones(
input_ids.size(0),
config.hidden_size,
device=input_ids.device,
dtype=input_ids.dtype,
)
* init_value
)
# first layer
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
hidden_states = (residual + 1) ** 2
# following layers
for _ in range(config.num_layers - 1):
hidden_states = hidden_states + residual
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
hidden_states = (residual + 1) ** 2
return hidden_states
@torch.inference_mode
def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
# Start with a fresh copy to make sure there's no cache dir sharing
compile_config = deepcopy(compile_config)
cudagraph_runtime_mode = compile_config.cudagraph_mode
vllm_config = VllmConfig(
compilation_config=compile_config, additional_config=llama_config
)
with set_current_vllm_config(vllm_config):
model = (
LlamaModel(config=llama_config, vllm_config=vllm_config, prefix="")
.eval()
.cuda()
)
with set_forward_context({}, vllm_config=vllm_config): # background context
B = 16 # max batch size
input_ids = torch.randint(0, llama_config.vocab_size, (B,)).cuda()
positions = torch.arange(B).cuda()
# warmup for the model with cudagraph_mode NONE
model(input_ids, positions)
# simulate cudagraphs capturing
with set_forward_context(
{},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=2,
),
):
model(input_ids[:2], positions[:2])
with set_forward_context(
{},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=1,
),
):
model(input_ids[:1], positions[:1])
input_ids[:2].zero_()
# simulate cudagraphs replay
with set_forward_context(
{},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=2,
),
):
output = model(input_ids[:2], positions[:2])
output = output.cpu()
if llama_config.tractable_init:
expected_output = tractable_computation(
input_ids[:2], positions[:2], llama_config
).cpu()
assert torch.allclose(output, expected_output)
else:
return output.cpu()
@pytest.mark.parametrize(
"backend, use_inductor_graph_partition",
[
("eager", False), # No inductor
("inductor", False), # Inductor, Dynamo partition
("inductor", True), # Inductor, Inductor partition
],
)
@create_new_process_for_each_test("spawn")
def test_toy_llama(
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
):
# Disable the vLLM compile cache to make sure we can properly track
# the number of Inductor compilations.
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition only supported in torch>=2.9")
# compare output with and without piecewise compilation
llama_config = LlamaConfig(
hidden_size=128, mlp_size=256, vocab_size=128, num_layers=12
)
tractable_config = LlamaConfig(
hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
)
compile_config_no_compile = CompilationConfig(
mode=CompilationMode.NONE,
cudagraph_mode=CUDAGraphMode.NONE,
backend="eager",
)
compile_config_no_split = CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_mode=CUDAGraphMode.PIECEWISE,
backend=backend,
cudagraph_capture_sizes=[1, 2],
)
compile_config_split = deepcopy(compile_config_no_split)
compile_config_split.splitting_ops = ["silly::attention"]
outputs = []
with compilation_counter.expect(
num_graphs_seen=0,
num_piecewise_graphs_seen=0,
num_piecewise_capturable_graphs_seen=0,
num_backend_compilations=0,
num_cudagraph_captured=0,
):
outputs.append(run_model(llama_config, compile_config_no_compile))
run_model(tractable_config, compile_config_no_compile)
if backend == "inductor":
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
else:
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2,
**kwargs,
):
outputs.append(run_model(llama_config, compile_config_no_split))
run_model(tractable_config, compile_config_no_split)
if use_inductor_graph_partition:
num_piecewise_fx = 1
num_piecewise_capturable_fx = 1
else:
num_piecewise_fx = 2 * llama_config.num_layers + 1
num_piecewise_capturable_fx = 1 + llama_config.num_layers
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=num_piecewise_fx,
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
num_backend_compilations=num_piecewise_capturable_fx,
# num_cudagraph_sizes * num_partitions
num_cudagraph_captured=2 * (1 + llama_config.num_layers),
):
outputs.append(run_model(llama_config, compile_config_split))
run_model(tractable_config, compile_config_split)
for i in range(1, len(outputs)):
assert torch.allclose(outputs[0], outputs[i])
@torch.inference_mode
def benchmark():
from triton.testing import do_bench
# similar to llama 3.1-8B
llama_config = LlamaConfig(
hidden_size=4096, mlp_size=14336, vocab_size=128 * 1024, num_layers=32
)
# a tiny model to measure the overhead
# of piecewise cudagraph
llama_config = LlamaConfig(
hidden_size=40, mlp_size=80, vocab_size=128, num_layers=2
)
cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]
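# 35 capture sizes in total: 1, 2, 4 and every multiple of 8 up to 256.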
eager_time = {}
full_cudagraph_time = {}
piecewise_cudagraph_time = {}
pool = torch.cuda.graph_pool_handle()
for piecewise in [False, True]:
if piecewise:
compilation_config = CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
splitting_ops=["silly::attention"],
cudagraph_capture_sizes=cudagraph_sizes,
)
else:
compilation_config = CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
cudagraph_capture_sizes=cudagraph_sizes,
)
vllm_config = VllmConfig(compilation_config=compilation_config)
with set_current_vllm_config(vllm_config):
model = (
LlamaModel(config=llama_config, vllm_config=vllm_config, prefix="")
.eval()
.cuda()
.to(torch.bfloat16)
)
B = 256 # max batch size
input_ids = torch.randint(0, llama_config.vocab_size, (B,)).cuda()
positions = torch.arange(B).cuda().to(torch.bfloat16)
graphs = {}
model(input_ids, positions)
for b in cudagraph_sizes[::-1]:
if not piecewise:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
output = model(input_ids[:b], positions[:b])
graphs[b] = (graph, output)
else:
output = model(input_ids[:b], positions[:b])
graphs[b] = (model, output)
for b in cudagraph_sizes:
if piecewise:
# noqa is for `Function definition does not bind loop variable`
# it will be problematic if we save the created lambda function
# and use it later, because it will look up the name `b` in the
# enclosing scope, and the value of `b` will always be 256.
# it is fine here, because we only use the lambda function once.
runtime = do_bench(
lambda: graphs[b][0]( # noqa
input_ids[:b], # noqa
positions[:b], # noqa
)
)
piecewise_cudagraph_time[b] = runtime
else:
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa
eager_runtime = do_bench(lambda: model(input_ids[:b], positions[:b])) # noqa
full_cudagraph_time[b] = runtime
eager_time[b] = eager_runtime
# print in tabular format
print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
for b in cudagraph_sizes:
print(
f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
f"\t{piecewise_cudagraph_time[b]:.3f}"
)
if __name__ == "__main__":
# Protect against subprocess reimport when using spawn_new_process_for_each_test
import os
if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
benchmark()