Unify the memory pool api and tp worker API (#1724)
This commit is contained in:
@@ -25,6 +25,8 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch

- ScheduleBatch is managed by `scheduler.py::Scheduler`.
  It contains high-level scheduling data. Most of the data is on the CPU.
- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
  It will be transformed from CPU scheduler to GPU model runner.
- ForwardBatch is managed by `model_runner.py::ModelRunner`.
  It contains low-level tensor data. Most of the data consists of GPU tensors.
"""
@@ -131,6 +131,13 @@ class ModelRunner:
        ]:
            server_args.disable_cuda_graph = True

        if self.server_args.enable_overlap_schedule:
            logger.warning(
                "Overlap scheduler is enabled. This is an experimental feature. "
                "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
                "and embedding APIs are not supported and will lead to wrong results."
            )

        # Global vars
        if server_args.show_time_cost:
            enable_show_time_cost()
||||
Reference in New Issue
Block a user