Support cuda graph in the triton attention backend (#1401)

This commit is contained in:
Lianmin Zheng
2024-09-12 00:36:55 -07:00
committed by GitHub
parent 2a71be5e25
commit 3efa798116
6 changed files with 147 additions and 60 deletions

View File

@@ -445,12 +445,6 @@ class ModelRunner:
if self.server_args.disable_cuda_graph:
return
if self.server_args.attention_backend != "flashinfer":
logger.warning(
f"Cuda graph is not supported for attention backend: {self.server_args.attention_backend}"
)
return
logger.info("Capture cuda graph begin. This can take up to several minutes.")
self.cuda_graph_runner = CudaGraphRunner(self)