[Fix] Adjust default chunked prefill size and cuda graph max bs according to GPU memory capacity (#2044)

This commit is contained in:
Lianmin Zheng
2024-11-15 06:21:57 -08:00
committed by GitHub
parent c29b98e043
commit b01df48cf2
2 changed files with 50 additions and 3 deletions

View File

@@ -22,7 +22,12 @@ import random
import tempfile
from typing import List, Optional
from sglang.srt.utils import is_flashinfer_available, is_ipv6, is_port_available
from sglang.srt.utils import (
get_gpu_memory_capacity,
is_flashinfer_available,
is_ipv6,
is_port_available,
)
logger = logging.getLogger(__name__)
@@ -143,6 +148,9 @@ class ServerArgs:
# Disable chunked prefill
self.chunked_prefill_size = None
if self.random_seed is None:
self.random_seed = random.randint(0, 1 << 30)
# Mem fraction depends on the tensor parallelism size
if self.mem_fraction_static is None:
if self.tp_size >= 16:
@@ -156,8 +164,14 @@ class ServerArgs:
else:
self.mem_fraction_static = 0.88
if self.random_seed is None:
self.random_seed = random.randint(0, 1 << 30)
# Adjust for GPUs with small memory capacities
gpu_mem = get_gpu_memory_capacity()
if gpu_mem < 25000:
logger.warning(
"Automatically adjust --chunked-prefill-size for small GPUs."
)
self.chunked_prefill_size //= 4 # make it 2048
self.cuda_graph_max_bs = 4
# Deprecation warnings
if self.disable_flashinfer: