Support EAGLE draft extend CUDA graph (#6606)
Co-authored-by: Sehoon Kim <sehoonkim@berkeley.edu>
This commit is contained in:
@@ -262,10 +262,14 @@ class ServerArgs:
|
||||
self.mem_fraction_static = 0.88
|
||||
if gpu_mem is not None and gpu_mem > 96 * 1024:
|
||||
mem_fraction = self.mem_fraction_static
|
||||
# 15 GB + an additional 3 GB for the CUDA graph
|
||||
reserve_mem = 1024 * 18
|
||||
# Need to reserve more memory for the speculative decoding CUDA graph
|
||||
if self.speculative_algorithm is not None:
|
||||
reserve_mem = 1024 * 20
|
||||
self.mem_fraction_static = min(
|
||||
mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
|
||||
(gpu_mem - 1024 * 18)
|
||||
/ gpu_mem, # 15 GB + additional 3GB for cuda graph
|
||||
(gpu_mem - reserve_mem) / gpu_mem,
|
||||
)
|
||||
|
||||
# Set chunked prefill size, which depends on the gpu memory capacity
|
||||
|
||||
Reference in New Issue
Block a user