Overlapped weight offload (#8034)

This commit is contained in:
fzyzcjy
2025-08-23 17:06:46 +08:00
committed by GitHub
parent ccd3fb946e
commit 2600fc0d47
9 changed files with 584 additions and 10 deletions

View File

@@ -85,7 +85,6 @@ class ServerArgs:
max_prefill_tokens: int = 16384
schedule_policy: str = "fcfs"
schedule_conservativeness: float = 1.0
cpu_offload_gb: int = 0
page_size: Optional[int] = None
hybrid_kvcache_ratio: Optional[float] = None
swa_full_tokens_ratio: float = 0.8
@@ -226,6 +225,13 @@ class ServerArgs:
ds_heavy_channel_type: str = "qk"
ds_sparse_decode_threshold: int = 4096
# Offloading
# Settings for weight offloading; each field is also the default of the
# same-named CLI flag (--cpu-offload-gb, --offload-group-size, ...).
# How many GBs of RAM to reserve for CPU offloading (default 0).
cpu_offload_gb: int = 0
# Number of layers per group in offloading.
# NOTE(review): the -1 default looks like a "not set / disabled" sentinel —
# confirm against the code that consumes this value.
offload_group_size: int = -1
# Number of layers to be offloaded within a group.
offload_num_in_group: int = 1
# Steps to prefetch in offloading.
offload_prefetch_step: int = 1
# Mode of offloading; defaults to "cpu".
# NOTE(review): the set of accepted mode strings is not visible here — confirm
# and document it where the mode is interpreted.
offload_mode: str = "cpu"
# Optimization/debug options
disable_radix_cache: bool = False
cuda_graph_max_bs: Optional[int] = None
@@ -976,12 +982,6 @@ class ServerArgs:
default=ServerArgs.schedule_conservativeness,
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
)
parser.add_argument(
"--cpu-offload-gb",
type=int,
default=ServerArgs.cpu_offload_gb,
help="How many GBs of RAM to reserve for CPU offloading.",
)
parser.add_argument(
"--page-size",
type=int,
@@ -1683,6 +1683,38 @@ class ServerArgs:
help="The type of heavy channels in double sparsity attention",
)
# Offloading
# Registers the offloading flags on `parser`. Every flag takes its default
# from the matching ServerArgs class attribute, so the two stay in sync.
# --cpu-offload-gb: integer amount of RAM (in GB) to reserve for CPU offloading.
parser.add_argument(
"--cpu-offload-gb",
type=int,
default=ServerArgs.cpu_offload_gb,
help="How many GBs of RAM to reserve for CPU offloading.",
)
# --offload-group-size: integer number of layers per offloading group.
# NOTE(review): no range check is applied at parse time; any int is accepted.
parser.add_argument(
"--offload-group-size",
type=int,
default=ServerArgs.offload_group_size,
help="Number of layers per group in offloading.",
)
# --offload-num-in-group: integer number of layers offloaded within each group.
# NOTE(review): presumably expected to be <= --offload-group-size; that is not
# enforced here — confirm where it is validated.
parser.add_argument(
"--offload-num-in-group",
type=int,
default=ServerArgs.offload_num_in_group,
help="Number of layers to be offloaded within a group.",
)
# --offload-prefetch-step: integer number of steps to prefetch in offloading.
parser.add_argument(
"--offload-prefetch-step",
type=int,
default=ServerArgs.offload_prefetch_step,
help="Steps to prefetch in offloading.",
)
# --offload-mode: free-form string selecting the offloading mode.
# NOTE(review): no `choices=` is given, so argparse accepts any string here;
# an unsupported mode can only be rejected later, by whatever consumes it.
parser.add_argument(
"--offload-mode",
type=str,
default=ServerArgs.offload_mode,
help="Mode of offloading.",
)
# Optimization/debug options
parser.add_argument(
"--disable-radix-cache",