[code move] move pp into a separate mixin (#11838)
This commit is contained in:
@@ -53,13 +53,7 @@ from sglang.srt.mem_cache.memory_pool import (
|
||||
NSATokenToKVPool,
|
||||
SWAKVPool,
|
||||
)
|
||||
from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
|
||||
from sglang.srt.utils import (
|
||||
DynamicGradMode,
|
||||
broadcast_pyobj,
|
||||
point_to_point_pyobj,
|
||||
require_mlp_sync,
|
||||
)
|
||||
from sglang.srt.utils import broadcast_pyobj, point_to_point_pyobj, require_mlp_sync
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch.distributed import ProcessGroup
|
||||
@@ -685,218 +679,6 @@ class SchedulerDisaggregationPrefillMixin:
|
||||
return
|
||||
req.disagg_kv_sender.send(page_indices, state_indices)
|
||||
|
||||
# PP
|
||||
# PP event loop for the disaggregated prefill server.
@DynamicGradMode()
def event_loop_pp_disagg_prefill(self: Scheduler):
    """
    An event loop for the prefill server in pipeline parallelism.

    Rules:
    1. Each stage runs in the same order and is notified by the previous stage.
    2. Each send/recv operation is blocking and matched by the neighboring stage.

    Regular Schedule:
    ====================================================================
    Stage i                   | Stage i+1
    send ith req              | recv ith req
    send ith proxy            | recv ith proxy
    send prev (i+1)th carry   | recv prev (i+1)th carry
    ====================================================================

    Prefill Server Schedule:
    ====================================================================
    Stage i                        | Stage i+1
    send ith req                   | recv ith req
    send ith bootstrap req         | recv ith bootstrap req
    send ith transferred req       | recv ith transferred req
    send ith proxy                 | recv ith proxy
    send prev (i+1)th carry        | recv prev (i+1)th carry
    send prev (i+1)th release req  | recv prev (i+1)th release req
    ====================================================================

    There are two additional elements compared to the regular schedule:

    1. Bootstrap Requests:
        a. Instead of polling the status on the current workers, we should wait for the previous stage to notify to avoid desynchronization.
        b. The first stage polls the status and propagates the bootstrapped requests down to all other stages.
        c. If the first stage polls successfully, by nature, other ranks are also successful because they performed a handshake together.

    2. Transferred Requests + Release Requests:
        a. The first stage polls the transfer finished requests, performs an intersection with the next stage's finished requests, and propagates down to the last stage.
        b. The last stage receives the requests that have finished transfer on all stages (consensus), then sends them to the first stage to release the memory.
        c. The first stage receives the release requests, releases the memory, and then propagates the release requests down to the last stage.

    NOTE: the relative order of sends/recvs below is load-bearing; each
    blocking operation must be matched in the same order by the neighbor
    stage, or the pipeline deadlocks.
    """
    # Local import to avoid a circular import with the scheduler module.
    from sglang.srt.managers.scheduler import GenerationBatchResult

    # One in-flight microbatch slot per pipeline stage.
    mbs = [None] * self.pp_size
    last_mbs = [None] * self.pp_size
    self.running_mbs = [
        ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
    ]
    pp_outputs: Optional[PPProxyTensors] = None

    # Either success or failed
    bootstrapped_rids: List[str] = []
    transferred_rids: List[str] = []
    release_rids: Optional[List[str]] = None

    # transferred microbatch
    tmbs = [None] * self.pp_size

    ENABLE_RELEASE = True  # For debug

    while True:
        server_is_idle = True

        for mb_id in range(self.pp_size):
            # Restore this microbatch's scheduling state.
            self.running_batch = self.running_mbs[mb_id]
            self.last_batch = last_mbs[mb_id]

            recv_reqs = self.recv_requests()

            self.process_input_requests(recv_reqs)

            if self.pp_group.is_first_rank:
                # First rank, pop the bootstrap reqs from the bootstrap queue
                bootstrapped_reqs, failed_reqs = (
                    self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
                        return_failed_reqs=True
                    )
                )
                bootstrapped_rids = [req.rid for req in bootstrapped_reqs] + [
                    req.rid for req in failed_reqs
                ]
                self.waiting_queue.extend(bootstrapped_reqs)
            else:
                # Other ranks, receive the bootstrap reqs info from the previous rank and ensure the consensus
                bootstrapped_rids = self.recv_pyobj_from_prev_stage()
                bootstrapped_reqs = (
                    self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
                        rids_to_check=bootstrapped_rids
                    )
                )
                self.waiting_queue.extend(bootstrapped_reqs)

            if self.pp_group.is_first_rank:
                transferred_rids = self.get_transferred_rids()
            # if other ranks,
            else:
                # 1. recv previous stage's transferred reqs info
                prev_transferred_rids = self.recv_pyobj_from_prev_stage()
                # 2. get the current stage's transferred reqs info
                curr_transferred_rids = self.get_transferred_rids()
                # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
                transferred_rids = list(
                    set(prev_transferred_rids) & set(curr_transferred_rids)
                )

            tmbs[mb_id] = transferred_rids

            self.process_prefill_chunk()
            mbs[mb_id] = self.get_new_batch_prefill()
            self.running_mbs[mb_id] = self.running_batch

            self.cur_batch = mbs[mb_id]
            if self.cur_batch:
                server_is_idle = False
                result = self.run_batch(self.cur_batch)

            # send the outputs to the next step
            if self.pp_group.is_last_rank:
                if self.cur_batch:
                    next_token_ids = result.next_token_ids
                    pp_outputs = PPProxyTensors(
                        {
                            "next_token_ids": next_token_ids,
                        }
                    )
                    # send the output from the last round to let the next stage worker run post processing
                    self.pp_group.send_tensor_dict(
                        pp_outputs.tensors,
                        all_gather_group=self.attn_tp_group,
                    )

            if ENABLE_RELEASE:
                if self.pp_group.is_last_rank:
                    # At the last stage, all stages has reached the consensus to release memory for transferred_rids
                    release_rids = transferred_rids
                    # send to the first rank
                    self.send_pyobj_to_next_stage(release_rids)

            # receive outputs and post-process (filter finished reqs) the coming microbatch
            next_mb_id = (mb_id + 1) % self.pp_size
            next_pp_outputs = None
            next_release_rids = None

            if mbs[next_mb_id] is not None:
                next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
                    self.pp_group.recv_tensor_dict(
                        all_gather_group=self.attn_tp_group
                    )
                )
                mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
                # NOTE(review): `result` here is the current microbatch's run;
                # this assumes `result` is bound whenever mbs[next_mb_id] is
                # not None — confirm against scheduler invariants.
                output_result = GenerationBatchResult(
                    logits_output=None,
                    pp_hidden_states_proxy_tensors=None,
                    next_token_ids=next_pp_outputs["next_token_ids"],
                    extend_input_len_per_req=None,
                    extend_logprob_start_len_per_req=None,
                    can_run_cuda_graph=result.can_run_cuda_graph,
                )
                self.process_batch_result_disagg_prefill(
                    mbs[next_mb_id], output_result
                )

                last_mbs[next_mb_id] = mbs[next_mb_id]

            if ENABLE_RELEASE:
                if tmbs[next_mb_id] is not None:
                    # recv consensus rids from the previous rank
                    next_release_rids = self.recv_pyobj_from_prev_stage()
                    self.process_disagg_prefill_inflight_queue(next_release_rids)

            # carry the outputs to the next stage
            if not self.pp_group.is_last_rank:
                if pp_outputs:
                    # send the outputs from the last round to let the next stage worker run post processing
                    self.pp_group.send_tensor_dict(
                        pp_outputs.tensors,
                        all_gather_group=self.attn_tp_group,
                    )
                if ENABLE_RELEASE:
                    if release_rids is not None:
                        self.send_pyobj_to_next_stage(release_rids)

            if not self.pp_group.is_last_rank:
                # send out reqs to the next stage
                self.send_pyobj_to_next_stage(recv_reqs)
                self.send_pyobj_to_next_stage(bootstrapped_rids)
                self.send_pyobj_to_next_stage(transferred_rids)

                # send out proxy tensors to the next stage
                if self.cur_batch:
                    # FIXME(lsyin): remove this assert
                    assert result.pp_hidden_states_proxy_tensors.tensors is not None
                    self.pp_group.send_tensor_dict(
                        result.pp_hidden_states_proxy_tensors.tensors,
                        all_gather_group=self.attn_tp_group,
                    )

            # Carry this round's received outputs/release list into the next
            # microbatch slot.
            pp_outputs = next_pp_outputs
            release_rids = next_release_rids

            self.running_batch.batch_is_full = False

        if not ENABLE_RELEASE:
            if len(self.disagg_prefill_inflight_queue) > 0:
                self.process_disagg_prefill_inflight_queue()

        # When the server is idle, self-check and re-init some states
        if server_is_idle and len(self.disagg_prefill_inflight_queue) == 0:
            self.check_memory()
            self.check_tree_cache()
            self.new_token_ratio = self.init_new_token_ratio
|
||||
|
||||
def send_pyobj_to_next_stage(self, data):
|
||||
if self.attn_tp_rank == 0:
|
||||
dp_offset = self.attn_dp_rank * self.attn_tp_size
|
||||
|
||||
@@ -78,6 +78,7 @@ from sglang.srt.utils import flatten_nested_list
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.speculative.eagle_info import EagleDraftInput
|
||||
from sglang.srt.speculative.spec_info import SpecInput, SpeculativeAlgorithm
|
||||
|
||||
INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
|
||||
@@ -1527,8 +1528,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
|
||||
if self.is_v2_eagle:
|
||||
# TODO(spec-v2): all v2 spec should go through this path
|
||||
from sglang.srt.speculative.eagle_info import EagleDraftInput
|
||||
|
||||
draft_input: EagleDraftInput = self.spec_info
|
||||
draft_input.prepare_for_decode(self)
|
||||
|
||||
@@ -1585,8 +1584,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
|
||||
def maybe_wait_verify_done(self):
|
||||
if self.is_v2_eagle:
|
||||
from sglang.srt.speculative.eagle_info import EagleDraftInput
|
||||
|
||||
draft_input: EagleDraftInput = self.spec_info
|
||||
if draft_input.verify_done is not None:
|
||||
draft_input.verify_done.synchronize()
|
||||
|
||||
@@ -63,7 +63,6 @@ from sglang.srt.distributed import get_pp_group, get_world_group
|
||||
from sglang.srt.environ import envs
|
||||
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.layers.moe import initialize_moe_config
|
||||
from sglang.srt.managers.io_struct import (
|
||||
AbortReq,
|
||||
@@ -114,7 +113,7 @@ from sglang.srt.managers.io_struct import (
|
||||
UpdateWeightsFromTensorReqInput,
|
||||
)
|
||||
from sglang.srt.managers.mm_utils import init_embedding_cache
|
||||
from sglang.srt.managers.overlap_utils import FutureIndices, FutureMap
|
||||
from sglang.srt.managers.overlap_utils import FutureMap
|
||||
from sglang.srt.managers.schedule_batch import (
|
||||
FINISH_ABORT,
|
||||
ModelWorkerBatch,
|
||||
@@ -136,22 +135,21 @@ from sglang.srt.managers.scheduler_metrics_mixin import (
|
||||
from sglang.srt.managers.scheduler_output_processor_mixin import (
|
||||
SchedulerOutputProcessorMixin,
|
||||
)
|
||||
from sglang.srt.managers.scheduler_pp_mixin import SchedulerPPMixin
|
||||
from sglang.srt.managers.scheduler_profiler_mixin import SchedulerProfilerMixin
|
||||
from sglang.srt.managers.scheduler_recv_skipper import SchedulerRecvSkipper
|
||||
from sglang.srt.managers.scheduler_update_weights_mixin import (
|
||||
SchedulerUpdateWeightsMixin,
|
||||
)
|
||||
from sglang.srt.managers.session_controller import Session
|
||||
from sglang.srt.managers.utils import validate_input_length
|
||||
from sglang.srt.managers.utils import GenerationBatchResult, validate_input_length
|
||||
from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
|
||||
from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
|
||||
from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
|
||||
from sglang.srt.mem_cache.radix_cache import RadixCache
|
||||
from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
|
||||
from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
|
||||
from sglang.srt.parser.reasoning_parser import ReasoningParser
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs, get_global_server_args
|
||||
from sglang.srt.speculative.eagle_info import EagleDraftInput
|
||||
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
||||
from sglang.srt.tracing.trace import (
|
||||
process_tracing_init,
|
||||
@@ -198,77 +196,6 @@ TEST_RETRACT = get_bool_env_var("SGLANG_TEST_RETRACT")
|
||||
GRAMMAR_TIMEOUT = float(os.environ.get("SGLANG_GRAMMAR_TIMEOUT", 300))
|
||||
|
||||
|
||||
@dataclass
class GenerationBatchResult:
    """Result of running one generation (forward) batch.

    Carries the model outputs plus bookkeeping needed by the output
    processor, the overlap scheduler, and pipeline-parallel relaying.
    """

    logits_output: Optional[LogitsProcessorOutput] = None
    pp_hidden_states_proxy_tensors: Optional[PPProxyTensors] = None
    next_token_ids: Optional[torch.Tensor] = None
    num_accepted_tokens: Optional[int] = None
    can_run_cuda_graph: bool = False

    # For output processing
    extend_input_len_per_req: Optional[List[int]] = None
    extend_logprob_start_len_per_req: Optional[List[int]] = None

    # For overlap scheduling
    copy_done: Optional[torch.cuda.Event] = None
    delay_sample_func: Optional[callable] = None
    future_indices: Optional[FutureIndices] = None

    # FIXME(lsyin): maybe move to a better place?
    # sync path: forward stream -> output processor
    accept_lens: Optional[torch.Tensor] = None
    allocate_lens: Optional[torch.Tensor] = None

    # relay path: forward stream -> next step forward
    next_draft_input: Optional[EagleDraftInput] = None

    def copy_to_cpu(self, return_logprob: bool = False):
        """Copy tensors to CPU in overlap scheduling.
        Only the tensors which are needed for processing results are copied,
        e.g., next_token_ids, logits outputs

        NOTE(review): assumes `logits_output`, `next_token_ids`, and
        `copy_done` are non-None when this is called — confirm callers
        uphold this; all copies are async (non_blocking) and `copy_done`
        is recorded so consumers can synchronize on completion.
        """
        if return_logprob:
            if self.logits_output.next_token_logits is not None:
                self.logits_output.next_token_logits = (
                    self.logits_output.next_token_logits.to("cpu", non_blocking=True)
                )
            if self.logits_output.input_token_logprobs is not None:
                self.logits_output.input_token_logprobs = (
                    self.logits_output.input_token_logprobs.to("cpu", non_blocking=True)
                )
        if self.logits_output.hidden_states is not None:
            self.logits_output.hidden_states = self.logits_output.hidden_states.to(
                "cpu", non_blocking=True
            )
        self.next_token_ids = self.next_token_ids.to("cpu", non_blocking=True)

        if self.accept_lens is not None:
            self.accept_lens = self.accept_lens.to("cpu", non_blocking=True)

        if self.allocate_lens is not None:
            self.allocate_lens = self.allocate_lens.to("cpu", non_blocking=True)

        self.copy_done.record()

    @classmethod
    def from_pp_proxy(
        cls, logits_output, next_pp_outputs: PPProxyTensors, can_run_cuda_graph
    ):
        """Build a result from tensors relayed by the previous PP stage."""
        # TODO(lsyin): refactor PP and avoid using dict
        proxy_dict = next_pp_outputs.tensors
        return cls(
            logits_output=logits_output,
            pp_hidden_states_proxy_tensors=None,
            next_token_ids=next_pp_outputs["next_token_ids"],
            extend_input_len_per_req=proxy_dict.get("extend_input_len_per_req", None),
            extend_logprob_start_len_per_req=proxy_dict.get(
                "extend_logprob_start_len_per_req", None
            ),
            can_run_cuda_graph=can_run_cuda_graph,
        )
|
||||
|
||||
|
||||
@dataclass
|
||||
class EmbeddingBatchResult:
|
||||
embeddings: torch.Tensor
|
||||
@@ -281,6 +208,7 @@ class Scheduler(
|
||||
SchedulerMetricsMixin,
|
||||
SchedulerDisaggregationDecodeMixin,
|
||||
SchedulerDisaggregationPrefillMixin,
|
||||
SchedulerPPMixin,
|
||||
):
|
||||
"""A scheduler that manages a tensor parallel GPU worker."""
|
||||
|
||||
@@ -1058,128 +986,6 @@ class Scheduler(
|
||||
self.launch_batch_sample_if_needed(batch_result)
|
||||
self.last_batch = batch
|
||||
|
||||
@DynamicGradMode()
def event_loop_pp(self):
    """A non-overlap scheduler loop for pipeline parallelism.

    Keeps `pp_size` microbatches in flight; each iteration runs microbatch
    `mb_id` and post-processes the outputs of microbatch `mb_id + 1`.
    The relative order of the blocking send/recv collectives below must be
    matched by the neighboring stages, or the pipeline deadlocks.
    """
    mbs = [None] * self.pp_size
    last_mbs = [None] * self.pp_size
    self.running_mbs = [
        ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
    ]
    pp_outputs: Optional[PPProxyTensors] = None
    while True:
        server_is_idle = True
        for mb_id in range(self.pp_size):
            # Restore this microbatch's scheduling state.
            self.running_batch = self.running_mbs[mb_id]
            self.last_batch = last_mbs[mb_id]

            recv_reqs = self.recv_requests()
            self.process_input_requests(recv_reqs)
            mbs[mb_id] = self.get_next_batch_to_run()
            self.running_mbs[mb_id] = self.running_batch

            self.cur_batch = mbs[mb_id]
            if self.cur_batch:
                server_is_idle = False
                result = self.run_batch(self.cur_batch)

            # (last rank) send the outputs to the next step
            if self.pp_group.is_last_rank:
                if self.cur_batch:
                    next_token_ids = result.next_token_ids
                    if self.cur_batch.return_logprob:
                        # Flatten logits_output fields into the proxy dict
                        # under "logits_output."-prefixed keys.
                        pp_outputs = PPProxyTensors(
                            {
                                "next_token_ids": next_token_ids,
                                "extend_input_len_per_req": result.extend_input_len_per_req,
                                "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
                            }
                            | (
                                {
                                    f"logits_output.{k}": v
                                    for k, v in result.logits_output.__dict__.items()
                                }
                                if result.logits_output is not None
                                else {}
                            )
                        )
                    else:
                        pp_outputs = PPProxyTensors(
                            {
                                "next_token_ids": next_token_ids,
                            }
                        )
                    # send the output from the last round to let the next stage worker run post processing
                    self.pp_group.send_tensor_dict(
                        pp_outputs.tensors,
                        all_gather_group=self.attn_tp_group,
                    )

            # receive outputs and post-process (filter finished reqs) the coming microbatch
            next_mb_id = (mb_id + 1) % self.pp_size
            next_pp_outputs = None
            if mbs[next_mb_id] is not None:
                next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
                    self.pp_group.recv_tensor_dict(
                        all_gather_group=self.attn_tp_group
                    )
                )
                mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
                # Reassemble the logits_output from its prefixed proxy keys.
                logits_output_args = {
                    k[len("logits_output.") :]: v
                    for k, v in next_pp_outputs.tensors.items()
                    if k.startswith("logits_output.")
                }
                if len(logits_output_args) > 0:
                    logits_output = LogitsProcessorOutput(**logits_output_args)
                else:
                    logits_output = None

                # NOTE(review): assumes `result` is bound whenever
                # mbs[next_mb_id] is not None — confirm scheduler invariants.
                output_result = GenerationBatchResult.from_pp_proxy(
                    logits_output=logits_output,
                    next_pp_outputs=next_pp_outputs,
                    can_run_cuda_graph=result.can_run_cuda_graph,
                )
                self.process_batch_result(mbs[next_mb_id], output_result)
                last_mbs[next_mb_id] = mbs[next_mb_id]

            # (not last rank)
            if not self.pp_group.is_last_rank:
                # carry the outputs to the next stage
                # send the outputs from the last round to let the next stage worker run post processing
                if pp_outputs:
                    self.pp_group.send_tensor_dict(
                        pp_outputs.tensors,
                        all_gather_group=self.attn_tp_group,
                    )

                # send out reqs to the next stage
                dp_offset = self.attn_dp_rank * self.attn_tp_size
                if self.attn_tp_rank == 0:
                    point_to_point_pyobj(
                        recv_reqs,
                        self.pp_rank * self.tp_size + dp_offset,
                        self.world_group.device_group,
                        self.pp_rank * self.tp_size + dp_offset,
                        (self.pp_rank + 1) * self.tp_size + dp_offset,
                    )

                # send out proxy tensors to the next stage
                if self.cur_batch:
                    # FIXME(lsyin): remove this assert
                    assert result.pp_hidden_states_proxy_tensors.tensors is not None
                    self.pp_group.send_tensor_dict(
                        result.pp_hidden_states_proxy_tensors.tensors,
                        all_gather_group=self.attn_tp_group,
                    )

            pp_outputs = next_pp_outputs

        # When the server is idle, self-check and re-init some states
        if server_is_idle:
            # When the server is idle, do self-check and re-init some states
            self.self_check_during_idle()
|
||||
|
||||
def recv_requests(self) -> List[Req]:
|
||||
"""Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
|
||||
|
||||
|
||||
341
python/sglang/srt/managers/scheduler_pp_mixin.py
Normal file
341
python/sglang/srt/managers/scheduler_pp_mixin.py
Normal file
@@ -0,0 +1,341 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.managers.schedule_batch import ScheduleBatch
|
||||
from sglang.srt.managers.utils import GenerationBatchResult
|
||||
from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
|
||||
from sglang.srt.utils import DynamicGradMode, point_to_point_pyobj
|
||||
|
||||
|
||||
class SchedulerPPMixin:
    """Mixin holding the pipeline-parallel (PP) event loops for `Scheduler`.

    Mixed into the Scheduler class; relies on scheduler attributes such as
    `pp_size`, `pp_group`, `attn_tp_group`, `running_batch`, and the
    request queues. All send/recv operations are blocking and must be
    matched in the same order by the neighboring pipeline stage.
    """

    @DynamicGradMode()
    def event_loop_pp(self):
        """A non-overlap scheduler loop for pipeline parallelism.

        Keeps `pp_size` microbatches in flight; each iteration runs
        microbatch `mb_id` and post-processes the outputs of microbatch
        `mb_id + 1`.
        """
        mbs = [None] * self.pp_size
        last_mbs = [None] * self.pp_size
        self.running_mbs = [
            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
        ]
        pp_outputs: Optional[PPProxyTensors] = None
        while True:
            server_is_idle = True
            for mb_id in range(self.pp_size):
                # Restore this microbatch's scheduling state.
                self.running_batch = self.running_mbs[mb_id]
                self.last_batch = last_mbs[mb_id]

                recv_reqs = self.recv_requests()
                self.process_input_requests(recv_reqs)
                mbs[mb_id] = self.get_next_batch_to_run()
                self.running_mbs[mb_id] = self.running_batch

                self.cur_batch = mbs[mb_id]
                if self.cur_batch:
                    server_is_idle = False
                    result = self.run_batch(self.cur_batch)

                # (last rank) send the outputs to the next step
                if self.pp_group.is_last_rank:
                    if self.cur_batch:
                        next_token_ids = result.next_token_ids
                        if self.cur_batch.return_logprob:
                            # Flatten logits_output fields into the proxy
                            # dict under "logits_output."-prefixed keys.
                            pp_outputs = PPProxyTensors(
                                {
                                    "next_token_ids": next_token_ids,
                                    "extend_input_len_per_req": result.extend_input_len_per_req,
                                    "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
                                }
                                | (
                                    {
                                        f"logits_output.{k}": v
                                        for k, v in result.logits_output.__dict__.items()
                                    }
                                    if result.logits_output is not None
                                    else {}
                                )
                            )
                        else:
                            pp_outputs = PPProxyTensors(
                                {
                                    "next_token_ids": next_token_ids,
                                }
                            )
                        # send the output from the last round to let the next stage worker run post processing
                        self.pp_group.send_tensor_dict(
                            pp_outputs.tensors,
                            all_gather_group=self.attn_tp_group,
                        )

                # receive outputs and post-process (filter finished reqs) the coming microbatch
                next_mb_id = (mb_id + 1) % self.pp_size
                next_pp_outputs = None
                if mbs[next_mb_id] is not None:
                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
                        self.pp_group.recv_tensor_dict(
                            all_gather_group=self.attn_tp_group
                        )
                    )
                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
                    # Reassemble the logits_output from its prefixed keys.
                    logits_output_args = {
                        k[len("logits_output.") :]: v
                        for k, v in next_pp_outputs.tensors.items()
                        if k.startswith("logits_output.")
                    }
                    if len(logits_output_args) > 0:
                        logits_output = LogitsProcessorOutput(**logits_output_args)
                    else:
                        logits_output = None

                    # NOTE(review): assumes `result` is bound whenever
                    # mbs[next_mb_id] is not None — confirm invariants.
                    output_result = GenerationBatchResult.from_pp_proxy(
                        logits_output=logits_output,
                        next_pp_outputs=next_pp_outputs,
                        can_run_cuda_graph=result.can_run_cuda_graph,
                    )
                    self.process_batch_result(mbs[next_mb_id], output_result)
                    last_mbs[next_mb_id] = mbs[next_mb_id]

                # (not last rank)
                if not self.pp_group.is_last_rank:
                    # carry the outputs to the next stage
                    # send the outputs from the last round to let the next stage worker run post processing
                    if pp_outputs:
                        self.pp_group.send_tensor_dict(
                            pp_outputs.tensors,
                            all_gather_group=self.attn_tp_group,
                        )

                    # send out reqs to the next stage
                    dp_offset = self.attn_dp_rank * self.attn_tp_size
                    if self.attn_tp_rank == 0:
                        point_to_point_pyobj(
                            recv_reqs,
                            self.pp_rank * self.tp_size + dp_offset,
                            self.world_group.device_group,
                            self.pp_rank * self.tp_size + dp_offset,
                            (self.pp_rank + 1) * self.tp_size + dp_offset,
                        )

                    # send out proxy tensors to the next stage
                    if self.cur_batch:
                        # FIXME(lsyin): remove this assert
                        assert result.pp_hidden_states_proxy_tensors.tensors is not None
                        self.pp_group.send_tensor_dict(
                            result.pp_hidden_states_proxy_tensors.tensors,
                            all_gather_group=self.attn_tp_group,
                        )

                pp_outputs = next_pp_outputs

            # When the server is idle, self-check and re-init some states
            if server_is_idle:
                # When the server is idle, do self-check and re-init some states
                self.self_check_during_idle()

    @DynamicGradMode()
    def event_loop_pp_disagg_prefill(self):
        """
        An event loop for the prefill server in pipeline parallelism.

        Rules:
        1. Each stage runs in the same order and is notified by the previous stage.
        2. Each send/recv operation is blocking and matched by the neighboring stage.

        Regular Schedule:
        ====================================================================
        Stage i                   | Stage i+1
        send ith req              | recv ith req
        send ith proxy            | recv ith proxy
        send prev (i+1)th carry   | recv prev (i+1)th carry
        ====================================================================

        Prefill Server Schedule:
        ====================================================================
        Stage i                        | Stage i+1
        send ith req                   | recv ith req
        send ith bootstrap req         | recv ith bootstrap req
        send ith transferred req       | recv ith transferred req
        send ith proxy                 | recv ith proxy
        send prev (i+1)th carry        | recv prev (i+1)th carry
        send prev (i+1)th release req  | recv prev (i+1)th release req
        ====================================================================

        There are two additional elements compared to the regular schedule:

        1. Bootstrap Requests:
            a. Instead of polling the status on the current workers, we should wait for the previous stage to notify to avoid desynchronization.
            b. The first stage polls the status and propagates the bootstrapped requests down to all other stages.
            c. If the first stage polls successfully, by nature, other ranks are also successful because they performed a handshake together.

        2. Transferred Requests + Release Requests:
            a. The first stage polls the transfer finished requests, performs an intersection with the next stage's finished requests, and propagates down to the last stage.
            b. The last stage receives the requests that have finished transfer on all stages (consensus), then sends them to the first stage to release the memory.
            c. The first stage receives the release requests, releases the memory, and then propagates the release requests down to the last stage.
        """
        mbs = [None] * self.pp_size
        last_mbs = [None] * self.pp_size
        self.running_mbs = [
            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
        ]
        pp_outputs: Optional[PPProxyTensors] = None

        # Either success or failed
        bootstrapped_rids: List[str] = []
        transferred_rids: List[str] = []
        release_rids: Optional[List[str]] = None

        # transferred microbatch
        tmbs = [None] * self.pp_size

        ENABLE_RELEASE = True  # For debug

        while True:
            server_is_idle = True

            for mb_id in range(self.pp_size):
                # Restore this microbatch's scheduling state.
                self.running_batch = self.running_mbs[mb_id]
                self.last_batch = last_mbs[mb_id]

                recv_reqs = self.recv_requests()

                self.process_input_requests(recv_reqs)

                if self.pp_group.is_first_rank:
                    # First rank, pop the bootstrap reqs from the bootstrap queue
                    bootstrapped_reqs, failed_reqs = (
                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
                            return_failed_reqs=True
                        )
                    )
                    bootstrapped_rids = [req.rid for req in bootstrapped_reqs] + [
                        req.rid for req in failed_reqs
                    ]
                    self.waiting_queue.extend(bootstrapped_reqs)
                else:
                    # Other ranks, receive the bootstrap reqs info from the previous rank and ensure the consensus
                    bootstrapped_rids = self.recv_pyobj_from_prev_stage()
                    bootstrapped_reqs = (
                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
                            rids_to_check=bootstrapped_rids
                        )
                    )
                    self.waiting_queue.extend(bootstrapped_reqs)

                if self.pp_group.is_first_rank:
                    transferred_rids = self.get_transferred_rids()
                # if other ranks,
                else:
                    # 1. recv previous stage's transferred reqs info
                    prev_transferred_rids = self.recv_pyobj_from_prev_stage()
                    # 2. get the current stage's transferred reqs info
                    curr_transferred_rids = self.get_transferred_rids()
                    # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
                    transferred_rids = list(
                        set(prev_transferred_rids) & set(curr_transferred_rids)
                    )

                tmbs[mb_id] = transferred_rids

                self.process_prefill_chunk()
                mbs[mb_id] = self.get_new_batch_prefill()
                self.running_mbs[mb_id] = self.running_batch

                self.cur_batch = mbs[mb_id]
                if self.cur_batch:
                    server_is_idle = False
                    result = self.run_batch(self.cur_batch)

                # send the outputs to the next step
                if self.pp_group.is_last_rank:
                    if self.cur_batch:
                        next_token_ids = result.next_token_ids
                        pp_outputs = PPProxyTensors(
                            {
                                "next_token_ids": next_token_ids,
                            }
                        )
                        # send the output from the last round to let the next stage worker run post processing
                        self.pp_group.send_tensor_dict(
                            pp_outputs.tensors,
                            all_gather_group=self.attn_tp_group,
                        )

                if ENABLE_RELEASE:
                    if self.pp_group.is_last_rank:
                        # At the last stage, all stages has reached the consensus to release memory for transferred_rids
                        release_rids = transferred_rids
                        # send to the first rank
                        self.send_pyobj_to_next_stage(release_rids)

                # receive outputs and post-process (filter finished reqs) the coming microbatch
                next_mb_id = (mb_id + 1) % self.pp_size
                next_pp_outputs = None
                next_release_rids = None

                if mbs[next_mb_id] is not None:
                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
                        self.pp_group.recv_tensor_dict(
                            all_gather_group=self.attn_tp_group
                        )
                    )
                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
                    # NOTE(review): assumes `result` is bound whenever
                    # mbs[next_mb_id] is not None — confirm invariants.
                    output_result = GenerationBatchResult(
                        logits_output=None,
                        pp_hidden_states_proxy_tensors=None,
                        next_token_ids=next_pp_outputs["next_token_ids"],
                        extend_input_len_per_req=None,
                        extend_logprob_start_len_per_req=None,
                        can_run_cuda_graph=result.can_run_cuda_graph,
                    )
                    self.process_batch_result_disagg_prefill(
                        mbs[next_mb_id], output_result
                    )

                    last_mbs[next_mb_id] = mbs[next_mb_id]

                if ENABLE_RELEASE:
                    if tmbs[next_mb_id] is not None:
                        # recv consensus rids from the previous rank
                        next_release_rids = self.recv_pyobj_from_prev_stage()
                        self.process_disagg_prefill_inflight_queue(next_release_rids)

                # carry the outputs to the next stage
                if not self.pp_group.is_last_rank:
                    if pp_outputs:
                        # send the outputs from the last round to let the next stage worker run post processing
                        self.pp_group.send_tensor_dict(
                            pp_outputs.tensors,
                            all_gather_group=self.attn_tp_group,
                        )
                    if ENABLE_RELEASE:
                        if release_rids is not None:
                            self.send_pyobj_to_next_stage(release_rids)

                if not self.pp_group.is_last_rank:
                    # send out reqs to the next stage
                    self.send_pyobj_to_next_stage(recv_reqs)
                    self.send_pyobj_to_next_stage(bootstrapped_rids)
                    self.send_pyobj_to_next_stage(transferred_rids)

                    # send out proxy tensors to the next stage
                    if self.cur_batch:
                        # FIXME(lsyin): remove this assert
                        assert result.pp_hidden_states_proxy_tensors.tensors is not None
                        self.pp_group.send_tensor_dict(
                            result.pp_hidden_states_proxy_tensors.tensors,
                            all_gather_group=self.attn_tp_group,
                        )

                # Carry this round's received outputs/release list into the
                # next microbatch slot.
                pp_outputs = next_pp_outputs
                release_rids = next_release_rids

                self.running_batch.batch_is_full = False

            if not ENABLE_RELEASE:
                if len(self.disagg_prefill_inflight_queue) > 0:
                    self.process_disagg_prefill_inflight_queue()

            # When the server is idle, self-check and re-init some states
            if server_is_idle and len(self.disagg_prefill_inflight_queue) == 0:
                self.check_memory()
                self.check_tree_cache()
                self.new_token_ratio = self.init_new_token_ratio
|
||||
@@ -1,18 +1,95 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.managers.overlap_utils import FutureIndices
|
||||
from sglang.srt.managers.schedule_batch import Req
|
||||
from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.managers.scheduler import GenerationBatchResult
|
||||
from sglang.srt.speculative.eagle_info import EagleDraftInput
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class GenerationBatchResult:
    """Outputs of one forward pass over a generation batch.

    Bundles the model-runner results (logits / sampled token ids, or the
    pipeline-parallel proxy tensors on non-last PP stages) together with the
    bookkeeping needed by output processing and overlap scheduling.
    """

    logits_output: Optional[LogitsProcessorOutput] = None
    pp_hidden_states_proxy_tensors: Optional[PPProxyTensors] = None
    next_token_ids: Optional[torch.Tensor] = None
    num_accepted_tokens: Optional[int] = None
    can_run_cuda_graph: bool = False

    # For output processing
    extend_input_len_per_req: Optional[List[int]] = None
    extend_logprob_start_len_per_req: Optional[List[int]] = None

    # For overlap scheduling
    copy_done: Optional[torch.cuda.Event] = None
    delay_sample_func: Optional[callable] = None
    future_indices: Optional[FutureIndices] = None

    # FIXME(lsyin): maybe move to a better place?
    # sync path: forward stream -> output processor
    accept_lens: Optional[torch.Tensor] = None
    allocate_lens: Optional[torch.Tensor] = None

    # relay path: forward stream -> next step forward
    next_draft_input: Optional[EagleDraftInput] = None

    def copy_to_cpu(self, return_logprob: bool = False):
        """Start async device-to-host copies of the tensors needed for
        result processing (overlap scheduling).

        Only the tensors consumed by the output processor are transferred:
        the sampled token ids, plus the logits outputs when *return_logprob*
        is set. Records ``copy_done`` so the consumer can wait on the event.
        NOTE(review): assumes ``copy_done`` was populated by the scheduler
        before this is called — confirm against the overlap path.
        """

        def _to_host(t: torch.Tensor) -> torch.Tensor:
            # Non-blocking copy; completion is tracked by the copy_done
            # event recorded at the end of this method.
            return t.to("cpu", non_blocking=True)

        if return_logprob:
            out = self.logits_output
            if out.next_token_logits is not None:
                out.next_token_logits = _to_host(out.next_token_logits)
            if out.input_token_logprobs is not None:
                out.input_token_logprobs = _to_host(out.input_token_logprobs)
        if self.logits_output.hidden_states is not None:
            self.logits_output.hidden_states = _to_host(
                self.logits_output.hidden_states
            )
        self.next_token_ids = _to_host(self.next_token_ids)

        if self.accept_lens is not None:
            self.accept_lens = _to_host(self.accept_lens)

        if self.allocate_lens is not None:
            self.allocate_lens = _to_host(self.allocate_lens)

        self.copy_done.record()

    @classmethod
    def from_pp_proxy(
        cls, logits_output, next_pp_outputs: PPProxyTensors, can_run_cuda_graph
    ):
        """Build a result on the last PP stage from the proxy tensors
        received from the previous stage.

        Extracts ``next_token_ids`` and the optional per-request extend
        metadata from the proxy dict; ``pp_hidden_states_proxy_tensors`` is
        cleared since the proxy has been consumed.
        """
        # TODO(lsyin): refactor PP and avoid using dict
        tensors = next_pp_outputs.tensors
        return cls(
            logits_output=logits_output,
            pp_hidden_states_proxy_tensors=None,
            next_token_ids=next_pp_outputs["next_token_ids"],
            extend_input_len_per_req=tensors.get("extend_input_len_per_req", None),
            extend_logprob_start_len_per_req=tensors.get(
                "extend_logprob_start_len_per_req", None
            ),
            can_run_cuda_graph=can_run_cuda_graph,
        )
|
||||
|
||||
|
||||
def validate_input_length(
|
||||
req: Req, max_req_input_len: int, allow_auto_truncate: bool
|
||||
) -> Optional[str]:
|
||||
|
||||
Reference in New Issue
Block a user