sglang/python/sglang/srt/model_executor/cuda_graph_runner.py

"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Run the model with cuda graph and torch.compile."""
import bisect
from contextlib import contextmanager
from typing import Callable

import torch
from vllm.distributed.parallel_state import graph_capture
from vllm.model_executor.custom_op import CustomOp

from sglang.srt.layers.flashinfer_utils import update_flashinfer_indices
from sglang.srt.layers.logits_processor import (
    LogitsMetadata,
    LogitsProcessor,
    LogitsProcessorOutput,
)
from sglang.srt.layers.sampler import SampleOutput
from sglang.srt.managers.schedule_batch import ScheduleBatch
from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
from sglang.srt.utils import monkey_patch_vllm_all_gather
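

# NOTE: vLLM's CustomOp modules dispatch to hand-written CUDA kernels by default,
# which torch.compile cannot trace. _to_torch switches every CustomOp to its native
# PyTorch implementation (forward_native) before compilation and back to
# forward_cuda afterwards (reverse=True).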
def _to_torch(model: torch.nn.Module, reverse: bool = False):
    for sub in model._modules.values():
        if isinstance(sub, CustomOp):
            if reverse:
                sub._forward_method = sub.forward_cuda
                setattr(sub, "is_torch_compile", False)
            else:
                sub._forward_method = sub.forward_native
                setattr(sub, "is_torch_compile", True)
        if isinstance(sub, torch.nn.Module):
            _to_torch(sub, reverse)


@contextmanager
def patch_model(
    model: torch.nn.Module, enable_compile: bool, tp_group: "GroupCoordinator"
):
    """Patch the model to make it compatible with torch.compile."""
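    # While compiling: swap CustomOps to their native PyTorch forward, patch vLLM's
    # all-gather, and disable the custom all-reduce communicator (ca_comm), which
    # appears not to play well with torch.compile tracing. Everything is restored in
    # the `finally` block once the caller is done with the compiled forward.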
    backup_ca_comm = None

    try:
        if enable_compile:
            _to_torch(model)
            monkey_patch_vllm_all_gather()
            backup_ca_comm = tp_group.ca_comm
            tp_group.ca_comm = None
            yield torch.compile(model.forward, mode="max-autotune-no-cudagraphs")
        else:
            yield model.forward
    finally:
        if enable_compile:
            _to_torch(model, reverse=True)
            monkey_patch_vllm_all_gather(reverse=True)
            tp_group.ca_comm = backup_ca_comm
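

# Global torch.compile / Inductor settings used when --enable-torch-compile is on.
# The raised dynamo cache limit avoids hitting the recompile limit, since separate
# specializations are compiled for the batch sizes in compile_bs.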
def set_torch_compile_config():
    import torch._dynamo.config
    import torch._inductor.config

    torch._inductor.config.coordinate_descent_tuning = True
    torch._inductor.config.triton.unique_kernel_names = True
    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future

    # FIXME: tmp workaround
    torch._dynamo.config.accumulated_cache_size_limit = 1024
class CudaGraphRunner:
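    # Typical flow: construct once at server start; then, for each decode batch, the
    # model runner calls can_run(batch_size) and, if it returns True, replay(batch)
    # instead of running the eager forward pass.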
    """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""

    def __init__(self, model_runner: "ModelRunner"):
        # Parse args
        self.model_runner = model_runner
        self.graphs = {}
        self.input_buffers = {}
        self.output_buffers = {}
        self.flashinfer_handlers = {}
        self.graph_memory_pool = None
        self.use_torch_compile = model_runner.server_args.enable_torch_compile
        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding

        # Batch sizes to capture
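        # Without padding, a graph must exist for the exact batch size, so every size
        # from 1 to 31 (plus 64 and 128) is captured. With padding enabled, a sparser
        # set is captured and smaller batches are padded up to the nearest captured
        # size at replay time; compile_bs is the subset that is also run through
        # torch.compile.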
        if self.model_runner.server_args.disable_cuda_graph_padding:
            self.capture_bs = list(range(1, 32)) + [64, 128]
        else:
            self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
        self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if self.use_torch_compile else []

        # Common inputs
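        # CUDA graphs replay with fixed memory addresses, so all input buffers are
        # allocated once at the maximum batch size and sliced per batch size;
        # replay() only copies fresh values into them.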
        self.max_bs = max(self.capture_bs)
        self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
        self.req_pool_indices = torch.zeros(
            (self.max_bs,), dtype=torch.int32, device="cuda"
        )
        self.seq_lens = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
        self.position_ids_offsets = torch.ones(
            (self.max_bs,), dtype=torch.int32, device="cuda"
        )
        self.out_cache_loc = torch.zeros(
            (self.max_bs,), dtype=torch.int32, device="cuda"
        )

        # Attention backend
        self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)

        # Sampling info
        vocab_size = model_runner.model_config.vocab_size
        self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size)
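        # The dummy sampling info is what gets captured into the graphs; real values
        # are copied into it in place (inplace_assign) on every replay.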

        if self.use_torch_compile:
            set_torch_compile_config()

        # Capture
        try:
            self.capture()
        except RuntimeError as e:
            raise Exception(
                f"Capture cuda graph failed: {e}\n"
                "Possible solutions:\n"
                "1. disable cuda graph by --disable-cuda-graph\n"
                "2. set --mem-fraction-static to a smaller value\n"
                "3. disable torch compile by not using --enable-torch-compile\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )

    def can_run(self, batch_size: int):
        if self.disable_padding:
            return batch_size in self.graphs
        else:
            return batch_size <= self.max_bs
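
    # capture(): record one CUDA graph per batch size in capture_bs. vLLM's
    # graph_capture() provides a capture-safe context for tensor-parallel
    # communication, and patch_model() wraps the forward pass with torch.compile
    # for the batch sizes listed in compile_bs.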
    def capture(self):
        with graph_capture() as graph_capture_context:
            self.stream = graph_capture_context.stream
            for bs in self.capture_bs:
                with patch_model(
                    self.model_runner.model,
                    bs in self.compile_bs,
                    self.model_runner.tp_group,
                ) as forward:
                    (
                        graph,
                        output_buffers,
                    ) = self.capture_one_batch_size(bs, forward)
                    self.graphs[bs] = graph
                    self.output_buffers[bs] = output_buffers

    def capture_one_batch_size(self, bs: int, forward: Callable):
        graph = torch.cuda.CUDAGraph()
        stream = self.stream

        # Common inputs
        input_ids = self.input_ids[:bs]
        req_pool_indices = self.req_pool_indices[:bs]
        seq_lens = self.seq_lens[:bs]
        position_ids_offsets = self.position_ids_offsets[:bs]
        out_cache_loc = self.out_cache_loc[:bs]

        # Attention backend
        self.model_runner.attn_backend.capture_cuda_graph_init(
            bs, req_pool_indices, seq_lens
        )

        # Run and capture
        def run_once():
            input_metadata = InputMetadata(
                forward_mode=ForwardMode.DECODE,
                sampling_info=self.sampling_info[:bs],
                batch_size=bs,
                req_pool_indices=req_pool_indices,
                seq_lens=seq_lens,
                req_to_token_pool=self.model_runner.req_to_token_pool,
                token_to_kv_pool=self.model_runner.token_to_kv_pool,
                attn_backend=self.model_runner.attn_backend,
                out_cache_loc=out_cache_loc,
                return_logprob=False,
                top_logprobs_nums=0,
                positions=(seq_lens - 1 + position_ids_offsets).to(torch.int64),
            )
            return forward(input_ids, input_metadata.positions, input_metadata)
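
        # Warm up twice outside the graph so one-time allocations and lazy
        # initialization happen before capture; the synchronize/barrier pairs keep
        # all tensor-parallel ranks in lockstep.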
        for _ in range(2):
            torch.cuda.synchronize()
            self.model_runner.tp_group.barrier()

            run_once()

            torch.cuda.synchronize()
            self.model_runner.tp_group.barrier()

        torch.cuda.synchronize()
        self.model_runner.tp_group.barrier()
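
        # Capture. Reusing self.graph_memory_pool lets all captured graphs share one
        # CUDA graph memory pool instead of each reserving its own.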
        with torch.cuda.graph(graph, pool=self.graph_memory_pool, stream=stream):
            out = run_once()

        torch.cuda.synchronize()
        self.model_runner.tp_group.barrier()

        self.graph_memory_pool = graph.pool()
        return graph, out

    def replay(self, batch: ScheduleBatch):
        assert batch.out_cache_loc is not None
        raw_bs = len(batch.reqs)

        # Pad
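        # Pick the smallest captured batch size that fits, then reset the padded tail
        # of the shared buffers so the unused slots carry neutral values.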
        index = bisect.bisect_left(self.capture_bs, raw_bs)
        bs = self.capture_bs[index]
        if bs != raw_bs:
            self.seq_lens.zero_()
            self.position_ids_offsets.fill_(1)
            self.out_cache_loc.zero_()

        # Common inputs
        self.input_ids[:raw_bs] = batch.input_ids
        self.req_pool_indices[:raw_bs] = batch.req_pool_indices
        self.seq_lens[:raw_bs] = batch.seq_lens
        self.position_ids_offsets[:raw_bs] = batch.position_ids_offsets
        self.out_cache_loc[:raw_bs] = batch.out_cache_loc

        # Attention backend
        self.model_runner.attn_backend.replay_cuda_graph_init(
            bs, self.req_pool_indices, self.seq_lens
        )

        # Sampling inputs
        self.sampling_info.inplace_assign(raw_bs, batch.sampling_info)

        # Replay
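        # Replaying re-launches the recorded kernels on the updated input buffers;
        # results land in the output buffers recorded at capture time.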
        torch.cuda.synchronize()
        self.graphs[bs].replay()
        torch.cuda.synchronize()
        sample_output, logits_output = self.output_buffers[bs]

        # Unpad
        if bs != raw_bs:
            logits_output = LogitsProcessorOutput(
                next_token_logits=logits_output.next_token_logits[:raw_bs],
                next_token_logprobs=None,
                normalized_prompt_logprobs=None,
                input_token_logprobs=None,
                input_top_logprobs=None,
                output_top_logprobs=None,
            )
            sample_output = SampleOutput(
                sample_output.success[:raw_bs],
                sample_output.probs[:raw_bs],
                sample_output.batch_next_token_ids[:raw_bs],
            )

        # Extract logprobs
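        # The graphs are captured with return_logprob=False, so logprobs are computed
        # here, outside the graph, from the replayed logits.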
        if batch.return_logprob:
            logits_output.next_token_logprobs = torch.nn.functional.log_softmax(
                logits_output.next_token_logits, dim=-1
            )
            return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums)
            if return_top_logprob:
                logits_metadata = LogitsMetadata(
                    forward_mode=ForwardMode.DECODE,
                    top_logprobs_nums=batch.top_logprobs_nums,
                )
                logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
                    logits_output.next_token_logprobs, logits_metadata
                )[1]

        return sample_output, logits_output