Move output processing logic from scheduler.py into a separate file (#4354)

Lianmin Zheng
2025-03-12 16:21:49 -07:00
committed by GitHub
parent 2c3656f276
commit e35a93fa8a
6 changed files with 634 additions and 609 deletions


@@ -20,14 +20,13 @@ import random
 import tempfile
 from typing import List, Optional

-import torch
-
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
     get_nvgpu_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -71,6 +70,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    page_size: int = 1

     # Other runtime options
     tp_size: int = 1
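
The new page_size field defaults to one token per page and is also exposed as a --page-size flag later in this diff. A minimal sketch of reading it through the CLI parser, assuming the existing ServerArgs.add_cli_args helper and a placeholder model path:

# Hedged sketch, not part of this commit; model path and page size are placeholders.
import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)  # registers --page-size along with the other flags
args = parser.parse_args(["--model-path", "dummy-model", "--page-size", "16"])
print(args.page_size)  # -> 16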
@@ -190,10 +190,10 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif torch.cuda.is_available():
+        if is_cuda():
             gpu_mem = get_nvgpu_memory_capacity()
+        elif is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
         elif self.device == "hpu":
             gpu_mem = get_hpu_memory_capacity()
         else:
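
The accelerator probing now checks NVIDIA first via is_cuda() instead of torch.cuda.is_available(), which also reports True on ROCm builds. A rough sketch of the assumed semantics of these helpers; the real implementations live in sglang.srt.utils and may differ:

import torch

# Assumed behaviour only, for illustration.
def is_cuda() -> bool:
    # NVIDIA CUDA build of PyTorch with a usable GPU (excludes ROCm, which
    # also answers True to torch.cuda.is_available()).
    return torch.cuda.is_available() and torch.version.hip is None

def is_hip() -> bool:
    # PyTorch built against ROCm/HIP (AMD GPUs).
    return torch.version.hip is not None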
@@ -258,7 +258,7 @@ class ServerArgs:
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
# Others
# Data parallelism attention
if self.enable_dp_attention:
self.dp_size = self.tp_size
assert self.tp_size % self.dp_size == 0
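
With --enable-dp-attention, the data-parallel size is tied to the tensor-parallel size. A small sketch of the resulting configuration, assuming ServerArgs can be constructed directly in your environment and using a placeholder model path:

from sglang.srt.server_args import ServerArgs

# Placeholder model path; construction runs __post_init__, which applies the
# adjustment shown in the hunk above.
args = ServerArgs(model_path="dummy-model", tp_size=4, enable_dp_attention=True)
assert args.dp_size == args.tp_size == 4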
@@ -507,6 +507,12 @@ class ServerArgs:
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading.",
         )
+        parser.add_argument(
+            "--page-size",
+            type=int,
+            default=ServerArgs.page_size,
+            help="The number of tokens in a page.",
+        )

         # Other runtime options
         parser.add_argument(