Unify the memory pool api and tp worker API (#1724)

This commit is contained in:
Lianmin Zheng
2024-10-19 23:19:26 -07:00
committed by GitHub
parent 95946271af
commit 59cbf47626
8 changed files with 87 additions and 25 deletions

@@ -25,6 +25,8 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
- ScheduleBatch is managed by `scheduler.py::Scheduler`.
It contains high-level scheduling data. Most of the data is on the CPU.
- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
It is passed from the CPU scheduler to the GPU model runner.
- ForwardBatch is managed by `model_runner.py::ModelRunner`.
It contains low-level tensor data. Most of the data consists of GPU tensors.
"""

@@ -131,6 +131,13 @@ class ModelRunner:
        ]:
            server_args.disable_cuda_graph = True
        if self.server_args.enable_overlap_schedule:
            logger.warning(
                "Overlap scheduler is enabled. This is an experimental feature. "
                "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
                "and embedding APIs are not supported and will lead to wrong results."
            )

        # Global vars
        if server_args.show_time_cost:
            enable_show_time_cost()