[Hack] Add pd-disaggregation decode polling interval (#10411)
This commit is contained in:
@@ -394,6 +394,9 @@ class ServerArgs:
|
||||
disaggregation_ib_device: Optional[str] = None
|
||||
num_reserved_decode_tokens: int = 512 # used for decode KV cache offload in PD (prefill-decode) disaggregation
|
||||
|
||||
# FIXME: hack to reduce inter-token latency (ITL) when the decode batch size is small
|
||||
disaggregation_decode_polling_interval: int = 1
|
||||
|
||||
# For model weight update
|
||||
custom_weight_loader: Optional[List[str]] = None
|
||||
weight_loader_disable_mmap: bool = False
|
||||
@@ -2245,6 +2248,12 @@ class ServerArgs:
|
||||
default=ServerArgs.num_reserved_decode_tokens,
|
||||
help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disaggregation-decode-polling-interval",
|
||||
type=int,
|
||||
default=ServerArgs.disaggregation_decode_polling_interval,
|
||||
help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
|
||||
)
|
||||
|
||||
# Custom weight loader
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user