[fix] relax mem_fraction_static for h200 (#5893)
Co-authored-by: alcanerian <alcanerian@gmail.com>
@@ -135,7 +135,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     gpu_mem = get_device_memory_capacity()
     # Batch size of each rank will not become so large when DP is on
-    if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1:
+    if gpu_mem is not None and gpu_mem > 96 * 1024:
         capture_bs += list(range(160, 257, 8))
 
     if max(capture_bs) > model_runner.req_to_token_pool.size:
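The new threshold compares device memory in MiB, so `96 * 1024` is 98,304 MiB (96 GiB): an H200 (~141 GB HBM3e) clears it, while 80 GB H100/A100 cards keep the shorter capture list. A minimal sketch of what this branch adds, where the 143,771 MiB reading for an H200 is an assumed example value rather than something taken from this patch:

```python
# Sketch of the new condition only, not the full get_batch_sizes_to_capture().
gpu_mem = 143_771   # assumed MiB reading for a ~141 GB H200
capture_bs = []     # stands in for the batch sizes chosen earlier in the function

if gpu_mem is not None and gpu_mem > 96 * 1024:  # 96 GiB = 98_304 MiB
    capture_bs += list(range(160, 257, 8))       # 160, 168, ..., 256

print(capture_bs[0], capture_bs[-1], len(capture_bs))  # 160 256 13
```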
@@ -222,20 +222,23 @@ class ServerArgs:
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if gpu_mem <= 81920:
-                if self.tp_size >= 16:
-                    self.mem_fraction_static = 0.79
-                elif self.tp_size >= 8:
-                    self.mem_fraction_static = 0.81
-                elif self.tp_size >= 4:
-                    self.mem_fraction_static = 0.85
-                elif self.tp_size >= 2:
-                    self.mem_fraction_static = 0.87
-                else:
-                    self.mem_fraction_static = 0.88
-            else:
-                # FIXME: more fine grained auto-selection polices
-                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+            if self.tp_size >= 16:
+                self.mem_fraction_static = 0.79
+            elif self.tp_size >= 8:
+                self.mem_fraction_static = 0.81
+            elif self.tp_size >= 4:
+                self.mem_fraction_static = 0.85
+            elif self.tp_size >= 2:
+                self.mem_fraction_static = 0.87
+            else:
+                self.mem_fraction_static = 0.88
+            if gpu_mem > 96 * 1024:
+                mem_fraction = self.mem_fraction_static
+                self.mem_fraction_static = min(
+                    mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                    (gpu_mem - 1024 * 18)
+                    / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
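To see how the new branch behaves on an H200, here is a worked example assuming `get_device_memory_capacity()` reports about 143,771 MiB for a 141 GB card and `tp_size == 1`; both figures are illustrative assumptions, while the formulas are taken from the patch:

```python
# Worked example of the new mem_fraction_static auto-selection on an H200-class GPU.
gpu_mem = 143_771    # assumed MiB reading for a ~141 GB H200
mem_fraction = 0.88  # the tp_size == 1 value from the ladder above

grown = mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem  # ~0.921
cap = (gpu_mem - 1024 * 18) / gpu_mem                            # ~0.872, keeps 18 GiB free

print(round(min(grown, cap), 3))  # 0.872
```

The cap wins here, so roughly 18 GiB stays unreserved (15 GB plus about 3 GB for CUDA graphs, per the inline comment), whereas the removed `(gpu_mem - 1024 * 13) / gpu_mem` branch would have chosen about 0.907 and left only 13 GiB of headroom; that extra slack is the relaxation named in the commit title.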