Fix the default chunked prefill size (#2268)
This commit is contained in:
@@ -253,6 +253,8 @@ class Scheduler:
|
|||||||
|
|
||||||
# Init chunked prefill
|
# Init chunked prefill
|
||||||
self.chunked_prefill_size = server_args.chunked_prefill_size
|
self.chunked_prefill_size = server_args.chunked_prefill_size
|
||||||
|
if self.chunked_prefill_size <= 0: # -1 means disable
|
||||||
|
self.chunked_prefill_size = None
|
||||||
self.being_chunked_req = None
|
self.being_chunked_req = None
|
||||||
self.is_mixed_chunk = (
|
self.is_mixed_chunk = (
|
||||||
self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
|
self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
|
||||||
|
|||||||
@@ -118,7 +118,7 @@ class ModelRunner:
|
|||||||
logger.info(
|
logger.info(
|
||||||
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
|
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
|
||||||
)
|
)
|
||||||
server_args.chunked_prefill_size = None
|
server_args.chunked_prefill_size = -1
|
||||||
self.mem_fraction_static *= 0.95
|
self.mem_fraction_static *= 0.95
|
||||||
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
|
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
|
||||||
if self.model_config.hf_config.architectures == [
|
if self.model_config.hf_config.architectures == [
|
||||||
@@ -148,12 +148,14 @@ class ModelRunner:
|
|||||||
|
|
||||||
set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))
|
set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))
|
||||||
|
|
||||||
# Init components
|
# Get memory before model loading
|
||||||
min_per_gpu_memory = self.init_torch_distributed()
|
min_per_gpu_memory = self.init_torch_distributed()
|
||||||
|
|
||||||
|
# Load the model
|
||||||
self.sampler = Sampler()
|
self.sampler = Sampler()
|
||||||
self.load_model()
|
self.load_model()
|
||||||
|
|
||||||
# Apply torch TP if model supports it
|
# Apply torch TP if the model supports it
|
||||||
supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
|
supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
|
||||||
if self.tp_size > 1 and supports_torch_tp:
|
if self.tp_size > 1 and supports_torch_tp:
|
||||||
self.apply_torch_tp()
|
self.apply_torch_tp()
|
||||||
@@ -161,6 +163,7 @@ class ModelRunner:
|
|||||||
else:
|
else:
|
||||||
self.torch_tp_applied = False
|
self.torch_tp_applied = False
|
||||||
|
|
||||||
|
# Init memory pool and attention backends
|
||||||
if server_args.lora_paths is not None:
|
if server_args.lora_paths is not None:
|
||||||
self.init_lora_manager()
|
self.init_lora_manager()
|
||||||
self.init_memory_pool(
|
self.init_memory_pool(
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ class ServerArgs:
|
|||||||
mem_fraction_static: Optional[float] = None
|
mem_fraction_static: Optional[float] = None
|
||||||
max_running_requests: Optional[int] = None
|
max_running_requests: Optional[int] = None
|
||||||
max_total_tokens: Optional[int] = None
|
max_total_tokens: Optional[int] = None
|
||||||
chunked_prefill_size: int = 8192
|
chunked_prefill_size: Optional[int] = None
|
||||||
max_prefill_tokens: int = 16384
|
max_prefill_tokens: int = 16384
|
||||||
schedule_policy: str = "lpm"
|
schedule_policy: str = "lpm"
|
||||||
schedule_conservativeness: float = 1.0
|
schedule_conservativeness: float = 1.0
|
||||||
@@ -128,7 +128,7 @@ class ServerArgs:
|
|||||||
enable_dp_attention: bool = False
|
enable_dp_attention: bool = False
|
||||||
enable_torch_compile: bool = False
|
enable_torch_compile: bool = False
|
||||||
torch_compile_max_bs: int = 32
|
torch_compile_max_bs: int = 32
|
||||||
cuda_graph_max_bs: int = 160
|
cuda_graph_max_bs: Optional[int] = None
|
||||||
torchao_config: str = ""
|
torchao_config: str = ""
|
||||||
enable_nan_detection: bool = False
|
enable_nan_detection: bool = False
|
||||||
enable_p2p_check: bool = False
|
enable_p2p_check: bool = False
|
||||||
@@ -144,14 +144,15 @@ class ServerArgs:
|
|||||||
if self.served_model_name is None:
|
if self.served_model_name is None:
|
||||||
self.served_model_name = self.model_path
|
self.served_model_name = self.model_path
|
||||||
|
|
||||||
if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
|
|
||||||
# Disable chunked prefill
|
|
||||||
self.chunked_prefill_size = None
|
|
||||||
|
|
||||||
if self.random_seed is None:
|
if self.random_seed is None:
|
||||||
self.random_seed = random.randint(0, 1 << 30)
|
self.random_seed = random.randint(0, 1 << 30)
|
||||||
|
|
||||||
# Mem fraction depends on the tensor parallelism size
|
if is_hip():
|
||||||
|
gpu_mem = get_amdgpu_memory_capacity()
|
||||||
|
else:
|
||||||
|
gpu_mem = get_nvgpu_memory_capacity()
|
||||||
|
|
||||||
|
# Set mem fraction static, which depends on the tensor parallelism size
|
||||||
if self.mem_fraction_static is None:
|
if self.mem_fraction_static is None:
|
||||||
if self.tp_size >= 16:
|
if self.tp_size >= 16:
|
||||||
self.mem_fraction_static = 0.79
|
self.mem_fraction_static = 0.79
|
||||||
@@ -164,18 +165,21 @@ class ServerArgs:
|
|||||||
else:
|
else:
|
||||||
self.mem_fraction_static = 0.88
|
self.mem_fraction_static = 0.88
|
||||||
|
|
||||||
# Adjust for GPUs with small memory capacities
|
# Set chunked prefill size, which depends on the gpu memory capacity
|
||||||
if is_hip():
|
if self.chunked_prefill_size is None:
|
||||||
gpu_mem = get_amdgpu_memory_capacity()
|
if gpu_mem < 25_000:
|
||||||
else:
|
self.chunked_prefill_size = 2048
|
||||||
gpu_mem = get_nvgpu_memory_capacity()
|
else:
|
||||||
|
self.chunked_prefill_size = 8192
|
||||||
|
|
||||||
if gpu_mem < 25000:
|
# Set cuda graph max batch size
|
||||||
logger.warning(
|
if self.cuda_graph_max_bs is None:
|
||||||
"Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
|
if gpu_mem < 25_000:
|
||||||
)
|
self.cuda_graph_max_bs = 8
|
||||||
|
else:
|
||||||
|
self.cuda_graph_max_bs = 160
|
||||||
|
|
||||||
# Choose kernel backends
|
# Set kernel backends
|
||||||
if not is_flashinfer_available():
|
if not is_flashinfer_available():
|
||||||
self.attention_backend = "triton"
|
self.attention_backend = "triton"
|
||||||
self.sampling_backend = "pytorch"
|
self.sampling_backend = "pytorch"
|
||||||
|
|||||||
Reference in New Issue
Block a user